I’m trying to scrape https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1 with Puppeteer and Node.js
The scraper works, but it only gets data from the first page because I don’t know how to paginate. The issue is that the URL doesn’t change with the page number, and there is no “next page” button.
How can I implement pagination with such constraints?
Below is my entire code:
JavaScript:
// Shared Puppeteer browser instance; initialised once in main().
let browser;
// Accumulates one result document per scraped job posting.
const ventureLoopResults = [];
// Search-results page to scrape.
const url =
  "https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1";
/**
 * Logs in to VentureLoop, then collects the detail-page URL of every job
 * listed on the given search-results page.
 *
 * @param {string} url - Search-results page to scrape after logging in.
 * @returns {Promise<string[]>} Absolute job-detail URLs; empty array on failure
 *   so callers can safely iterate the result.
 */
async function scrapeJobsInIndexPage(url) {
  try {
    const page = await browser.newPage();
    await page.goto("https://www.ventureloop.com/ventureloop/login.php", {
      waitUntil: "networkidle0",
    });
    await page.click("#close-cookies", { delay: 200 });
    // SECURITY: never hard-code credentials in source — read them from the
    // environment instead. (The original committed a real email/password.)
    await page.type("[name='email_1']", process.env.VENTURELOOP_EMAIL, {
      delay: 200,
    });
    await page.type("[name='pass']", process.env.VENTURELOOP_PASSWORD, {
      delay: 200,
    });
    await page.click("#formContainer > form > div:nth-child(5) > input", {
      delay: 200,
    });
    await page.waitForNavigation();
    await page.goto(url, { waitUntil: "networkidle0" });
    const html = await page.evaluate(() => document.body.innerHTML);
    const $ = cheerio.load(html); // cheerio.load() is synchronous; no await
    // NOTE(review): :even presumably deduplicates listing rows that repeat
    // the same link twice — confirm against the live markup.
    const jobs = $(".tsize a:even")
      .map(
        (i, element) =>
          "https://www.ventureloop.com/ventureloop/" + $(element).attr("href")
      )
      .get();
    console.log(jobs);
    return jobs;
  } catch (err) {
    console.error(err);
    // Returning undefined here crashed main()'s `jobs.length` loop;
    // an empty array keeps the caller safe.
    return [];
  }
}
/**
 * Scrapes one job-detail page, resolves its apply-link redirect, persists the
 * result, and appends it to the shared ventureLoopResults accumulator.
 *
 * Fixes over the original: the stray `}` that placed `return` between the
 * try body and catch (a syntax error), the `console.log(result)` on an
 * undefined variable, the unawaited save(), and the copy-paste duplicate
 * save of the same document into the unrelated GlassdoorDB model.
 *
 * @param {string} url - Job-detail page URL.
 * @param {object} page - Reusable Puppeteer Page instance.
 * @returns {Promise<object[]|undefined>} All results scraped so far, or
 *   undefined when scraping this page failed (error is logged).
 */
async function scrapeDescriptionPage(url, page) {
  let jobText;
  try {
    await page.goto(url, { waitUntil: "networkidle0" });
    const html = await page.evaluate(() => document.body.innerHTML);
    const $ = cheerio.load(html); // cheerio.load() is synchronous; no await
    jobText = $("#formContainer").text();
    const companyImage = await page.$eval(".cs-media img", (img) => img.src);
    const location = $(".location.mid").text();
    const jobPosition = $(".cs-post-title h2").text();
    const companyName = $(".cs-post-title h3").text();
    const applyLinkRedirect = $(".ltp-btn").attr("href");
    const jobDescription = $(
      "#formContainer > form > div > div > div.company-detail > div:nth-child(3)"
    ).html();
    const datePosted = $(
      "#formContainer > form > div > div > div.company-detail > ul > li:nth-child(3) > span"
    ).text();
    // Follow the apply link so we store the final, post-redirect URL.
    await page.goto(applyLinkRedirect, { waitUntil: "networkidle0" });
    const applyLink = page.url(); // page.url() is synchronous
    const ventureLoopResult = new testVentureLoopDB({
      url,
      jobPosition,
      companyName,
      applyLink,
      jobDescription,
      companyImage,
      datePosted,
      location,
    });
    ventureLoopResults.push(ventureLoopResult);
    console.log(ventureLoopResult);
    // Await the write so DB failures surface in this catch instead of as
    // unhandled promise rejections.
    await ventureLoopResult.save();
    return ventureLoopResults;
  } catch (err) {
    // Log the page text and URL to make scraping failures diagnosable.
    console.log(jobText);
    console.log(url);
    console.log(err);
  }
}
/**
 * Entry point: connects to MongoDB, launches the shared browser, scrapes the
 * index page, then visits each job-detail page sequentially (one shared tab).
 */
async function main() {
  await connectToMongoDb();
  browser = await puppeteer.launch({ headless: false });
  try {
    const descriptionPage = await browser.newPage();
    const jobs = await scrapeJobsInIndexPage(
      "https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1"
    );
    // Start at 0 — the original `i = 1` silently skipped the first job.
    for (let i = 0; i < jobs.length; i++) {
      const result = await scrapeDescriptionPage(jobs[i], descriptionPage);
      console.log(result);
    }
  } finally {
    // Always release the browser process, even if scraping throws.
    await browser.close();
  }
}
// Don't leave the top-level promise floating: surface failures and set a
// non-zero exit code.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Advertisement
Answer
Try something like this:
JavaScript
import puppeteer from 'puppeteer';

// Visible browser; defaultViewport: null lets the page fill the window.
const browser = await puppeteer.launch({ headless: false, defaultViewport: null });
try {
  // Reuse the tab Puppeteer opens at launch rather than creating a new one.
  const [tab] = await browser.pages();
  await tab.goto('https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1');

  // Pager widget: total page count, and the number of the active page.
  const SEL_TOTAL = '.pag_txt_tot';
  const SEL_CURRENT = '.pag_txt_current';
  await tab.waitForSelector(SEL_TOTAL);
  const pageCount = await tab.$eval(SEL_TOTAL, (el) => Number(el.innerText));

  let pageNo = 1;
  while (pageNo <= pageCount) {
    // The URL never changes, so the only reliable "page N is loaded" signal
    // is the pager reporting N as the current page.
    await tab.waitForFunction(
      (sel, page) => document.querySelector(sel)?.innerText === String(page),
      {},
      SEL_CURRENT,
      pageNo,
    );
    // Sample this page's content: the first data cell of the results table.
    const data = await tab.evaluate(
      () => document.querySelector('#news_tbl tr td')?.innerText,
    );
    console.log(`${pageNo}: ${data}`);
    // No "next" button exists — click the link immediately after the
    // highlighted current-page number in the pager.
    await tab.evaluate(() => {
      document.querySelector('span.current').nextElementSibling?.querySelector('a').click();
    });
    pageNo += 1;
  }
} catch (err) {
  console.error(err);
} finally {
  await browser.close();
}