I’m trying to scrape https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1 with Puppeteer and Node.js
The scraper works, but it only collects data from the first page because I don’t know how to paginate. The problem is that the URL doesn’t change with the page number, and there is no “next page” button to click.
How can I implement pagination with such constraints?
Below is my entire code:
// NOTE(review): assumes `puppeteer`, `cheerio`, `connectToMongoDb`, and the
// `testVentureLoopDB` Mongoose model are imported/defined elsewhere in the file.

let browser;
const ventureLoopResults = [];

// Page 1 of the VentureLoop search results (the site paginates via POST-backs,
// so this URL never changes with the page number).
const url =
  "https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1";

/**
 * Logs in to VentureLoop, opens the search-results page, and collects the
 * absolute URL of every job listing on that page.
 * @param {string} url - search-results URL to scrape
 * @returns {Promise<string[]|undefined>} job-detail URLs, or undefined on error
 */
async function scrapeJobsInIndexPage(url) {
  try {
    const page = await browser.newPage();
    await page.goto("https://www.ventureloop.com/ventureloop/login.php", {
      waitUntil: "networkidle0",
    });
    await page.click("#close-cookies", { delay: 200 });
    // SECURITY FIX: never hard-code credentials — read them from the environment.
    await page.type("[name='email_1']", process.env.VENTURELOOP_EMAIL, {
      delay: 200,
    });
    await page.type("[name='pass']", process.env.VENTURELOOP_PASSWORD, {
      delay: 200,
    });
    await page.click("#formContainer > form > div:nth-child(5) > input", {
      delay: 200,
    });
    await page.waitForNavigation();
    await page.goto(url, { waitUntil: "networkidle0" });

    const html = await page.evaluate(() => document.body.innerHTML);
    const $ = cheerio.load(html); // cheerio.load is synchronous — no await needed
    // Every other anchor inside .tsize holds a relative job-detail link.
    const jobs = $(".tsize a:even")
      .map(
        (i, element) =>
          "https://www.ventureloop.com/ventureloop/" + $(element).attr("href")
      )
      .get();
    console.log(jobs);
    return jobs;
  } catch (err) {
    console.error(err);
  }
}

/**
 * Scrapes one job-detail page, follows the apply-link redirect to capture the
 * final application URL, and persists the record to MongoDB.
 * @param {string} url - job-detail page URL
 * @param {import('puppeteer').Page} page - reusable Puppeteer page
 * @returns {Promise<object[]|undefined>} all results collected so far
 */
async function scrapeDescriptionPage(url, page) {
  let jobText;
  try {
    await page.goto(url, { waitUntil: "networkidle0" });
    const html = await page.evaluate(() => document.body.innerHTML);
    const $ = cheerio.load(html);
    jobText = $("#formContainer").text();
    const companyImage = await page.$eval(".cs-media img", (img) => img.src);
    const location = $(".location.mid").text();
    const jobPosition = $(".cs-post-title h2").text();
    const companyName = $(".cs-post-title h3").text();
    const applyLinkRedirect = $(".ltp-btn").attr("href");
    const jobDescription = $(
      "#formContainer > form > div > div > div.company-detail > div:nth-child(3)"
    ).html();
    const datePosted = $(
      "#formContainer > form > div > div > div.company-detail > ul > li:nth-child(3) > span"
    ).text();

    // Follow the redirect so we store the final application URL, not the hop.
    await page.goto(applyLinkRedirect, { waitUntil: "networkidle0" });
    const applyLink = page.url();

    const ventureLoopResult = new testVentureLoopDB({
      url,
      jobPosition,
      companyName,
      applyLink,
      jobDescription,
      companyImage,
      datePosted,
      location,
    });
    ventureLoopResults.push(ventureLoopResult);
    console.log(ventureLoopResults);
    // BUG FIX: the original had a stray `}` here (leaving `try` without a
    // `catch` — a SyntaxError), logged an undefined `result` variable, saved
    // the record a second time through the unrelated GlassdoorDB model, and
    // never awaited save(). Save once, awaited.
    await ventureLoopResult.save();
    return ventureLoopResults;
  } catch (err) {
    console.log(jobText);
    console.log(url);
    console.log(err);
  }
}

/** Entry point: connect to Mongo, launch the browser, scrape every job. */
async function main() {
  await connectToMongoDb();
  browser = await puppeteer.launch({ headless: false });
  try {
    const descriptionPage = await browser.newPage();
    const jobs = await scrapeJobsInIndexPage(url);
    // BUG FIX: the original loop started at i = 1 and silently skipped the
    // first job; iterate them all (and tolerate a failed index scrape).
    for (const jobUrl of jobs ?? []) {
      const result = await scrapeDescriptionPage(jobUrl, descriptionPage);
      console.log(result);
    }
  } finally {
    await browser.close(); // always release the browser, even on error
  }
}

main();
Advertisement
Answer
Try something like this:
import puppeteer from 'puppeteer';

const browser = await puppeteer.launch({ headless: false, defaultViewport: null });

try {
  const [page] = await browser.pages();
  await page.goto(
    'https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1',
  );

  // The site paginates without changing the URL, so instead of watching
  // navigation we watch the pager widget's "current page" counter.
  const totalPagesSelector = '.pag_txt_tot';
  const currentPageSelector = '.pag_txt_current';
  await page.waitForSelector(totalPagesSelector);
  const totalPages = await page.$eval(totalPagesSelector, (el) => Number(el.innerText));

  for (let currentPage = 1; currentPage <= totalPages; currentPage++) {
    // Block until the pager confirms the page we expect has rendered.
    await page.waitForFunction(
      (sel, pageNo) => document.querySelector(sel)?.innerText === String(pageNo),
      {},
      currentPageSelector,
      currentPage,
    );

    // Demo extraction: first data cell of the results table.
    const data = await page.evaluate(
      () => document.querySelector('#news_tbl tr td')?.innerText,
    );
    console.log(`${currentPage}: ${data}`);

    // BUG FIX: only click "next" while pages remain. On the last page the
    // sibling after `span.current` has no <a>, and the original
    // `?.querySelector('a').click()` would throw a TypeError (optional
    // chaining only guarded `nextElementSibling`, not the querySelector result).
    if (currentPage < totalPages) {
      await page.evaluate(() => {
        document
          .querySelector('span.current')
          ?.nextElementSibling?.querySelector('a')
          ?.click();
      });
    }
  }
} catch (err) {
  console.error(err);
} finally {
  await browser.close(); // release the browser even if scraping fails
}