
Pagination when there is no “next page” button and the URL doesn’t change

I’m trying to scrape https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1 with Puppeteer and Node.js.

The scraper works, but it only gets data from the first page because I don’t know how to paginate. The problem is that the URL doesn’t change with the page number, and there is no “next page” button.

How can I implement pagination with such constraints?

Below is my entire code:

const puppeteer = require("puppeteer");
const cheerio = require("cheerio");
// connectToMongoDb, testVentureLoopDB and GlassdoorDB are project-specific
// helpers/Mongoose models required from elsewhere in the project.

let browser;
const ventureLoopResults = [];
const url =
  "https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1";

async function scrapeJobsInIndexPage(url) {
  try {
    // const browser = await puppeteer.launch({ headless: false });
    const page = await browser.newPage();
    await page.goto("https://www.ventureloop.com/ventureloop/login.php", {
      waitUntil: "networkidle0",
    });
    await page.click("#close-cookies", {
      delay: 200,
    });
    await page.type("[name='email_1']", "natan.chapman@gmail.com", {
      delay: 200,
    });
    await page.type("[name='pass']", "Aw8rbJ!9bXt*dpb", { delay: 200 });
    await page.click("#formContainer > form > div:nth-child(5) > input", {
      delay: 200,
    });

    await page.waitForNavigation();
    await page.goto(url, { waitUntil: "networkidle0" });

    const html = await page.evaluate(() => document.body.innerHTML);
    const $ = cheerio.load(html); // cheerio.load() is synchronous, no await needed

    // Collect the job-detail links from the results table
    const jobs = $(".tsize a:even")
      .map(
        (i, element) =>
          "https://www.ventureloop.com/ventureloop/" + $(element).attr("href")
      )
      .get();
    console.log(jobs);
    return jobs;
  } catch (err) {
    console.error(err);
  }
}

async function scrapeDescriptionPage(url, page) {
  let jobText;

  try {
    await page.goto(url, { waitUntil: "networkidle0" });
    const html = await page.evaluate(() => document.body.innerHTML);
    const $ = cheerio.load(html);
    jobText = $("#formContainer").text();

    const companyImage = await page.$eval(
      // ".oc-photo-gallery .photo__10vsfGte img",
      ".cs-media img",
      (img) => img.src
    );
    const location = $(".location.mid").text();
    // const jobSalary = $(".css-1v5elnn.e11nt52q2 .small.css-10zcshf.e1v3ed7e1").text()
    const jobPosition = $(".cs-post-title h2").text();
    const companyName = $(".cs-post-title h3").text();
    const applyLinkRedirect = $(".ltp-btn").attr("href");
    // const jobDescription = $(".company-detail").html();
    const jobDescription = $(
      "#formContainer > form > div > div > div.company-detail > div:nth-child(3)"
    ).html();
    const datePosted = $(
      "#formContainer > form > div > div > div.company-detail > ul > li:nth-child(3) > span"
    ).text();

    await page.goto(applyLinkRedirect, { waitUntil: "networkidle0" });
    const applyLink = await page.url();

    let ventureLoopResult = new testVentureLoopDB({
      url,
      jobPosition,
      companyName,
      applyLink,
      jobDescription,
      companyImage,
      datePosted,
      //   jobSalary,
      location,
    });
    ventureLoopResults.push(ventureLoopResult);
    console.log(ventureLoopResults);
    await ventureLoopResult.save();
    const listingModel = new GlassdoorDB(ventureLoopResult);
    // const listingModel = new VentureLoopDB(ventureLoopResult);
    await listingModel.save();

    return ventureLoopResults;

    // while (await page.$("[data-test='pagination-next']")) {

    //   await page.click("[data-test='pagination-next']");
    // }
  } catch (err) {
    console.log(jobText);
    console.log(url);
    console.log(err);
  }
}

async function main() {
  await connectToMongoDb();
  browser = await puppeteer.launch({ headless: false });
  const descriptionPage = await browser.newPage();
  const jobs = await scrapeJobsInIndexPage(
    "https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1"
  );
  // Start at 0 so the first job link is not skipped
  for (let i = 0; i < jobs.length; i++) {
    const result = await scrapeDescriptionPage(jobs[i], descriptionPage);
    console.log(result);
  }
}

main();


Answer

Try something like this. The results page exposes the total page count in the .pag_txt_tot element and the current page number in .pag_txt_current, and the numbered pagination links can be clicked in-page, so you can loop until the current page number reaches the total:

import puppeteer from 'puppeteer';

const browser = await puppeteer.launch({ headless: false, defaultViewport: null });

try {
  const [page] = await browser.pages();

  await page.goto('https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1');

  const totalPagesSelector = '.pag_txt_tot';
  const currentPageSelector = '.pag_txt_current';

  await page.waitForSelector(totalPagesSelector);

  // Read the total number of result pages from the pagination widget
  const totalPages = await page.$eval(totalPagesSelector, el => Number(el.innerText));

  for (let currentPage = 1; currentPage <= totalPages; currentPage++) {
    // Wait until the pagination widget reports the page we expect
    await page.waitForFunction(
      (sel, page) => document.querySelector(sel)?.innerText === String(page),
      {},
      currentPageSelector,
      currentPage,
    );

    // Scrape whatever you need from the current page
    // (here just the first cell of the #news_tbl table, as a placeholder)
    const data = await page.evaluate(() => {
      const firstDataCell = document.querySelector('#news_tbl tr td')?.innerText;
      return firstDataCell;
    });
    console.log(`${currentPage}: ${data}`);

    // Advance by clicking the page-number link that follows the highlighted
    // current page; on the last page there is no next sibling, so the
    // optional chaining turns this into a no-op
    await page.evaluate(() => {
      document.querySelector('span.current').nextElementSibling?.querySelector('a').click();
    });
  }

} catch (err) { console.error(err); } finally { await browser.close(); }
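
If you want to fold this into your existing scraper, here is a minimal sketch along the same lines that collects the job links from every page before handing them to scrapeDescriptionPage(). It assumes the .tsize a markup and the ":even"-style de-duplication from your cheerio code hold on every page:

import puppeteer from 'puppeteer';

const browser = await puppeteer.launch({ headless: false, defaultViewport: null });

try {
  const [page] = await browser.pages();
  await page.goto('https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1');

  await page.waitForSelector('.pag_txt_tot');
  const totalPages = await page.$eval('.pag_txt_tot', el => Number(el.innerText));

  const jobLinks = [];
  for (let currentPage = 1; currentPage <= totalPages; currentPage++) {
    // Wait for the pagination widget to show the page we expect
    await page.waitForFunction(
      (sel, n) => document.querySelector(sel)?.innerText === String(n),
      {},
      '.pag_txt_current',
      currentPage,
    );

    // Grab the job-detail hrefs on this page; keeping every second anchor
    // mirrors the ":even" filter used in the question (assumed markup)
    const links = await page.$$eval('.tsize a', anchors =>
      anchors
        .filter((_, i) => i % 2 === 0)
        .map(a => a.getAttribute('href'))
    );
    jobLinks.push(...links.map(href => 'https://www.ventureloop.com/ventureloop/' + href));

    // Click through to the next page (no-op on the last page)
    await page.evaluate(() => {
      document.querySelector('span.current')?.nextElementSibling?.querySelector('a')?.click();
    });
  }

  console.log(jobLinks.length, 'job links collected');
  // jobLinks can now be fed to your existing scrapeDescriptionPage() loop
} catch (err) {
  console.error(err);
} finally {
  await browser.close();
}

If the search results require the login step from your code, run it before the first goto; the pagination loop itself stays the same.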