Below is the code snippet from my data scraping file:
const puppeteer = require('puppeteer');
const db = require('../db');
const Job = require('../models/job');

(async () => {
  try {
    const browser = await puppeteer.launch({
      headless: false,
      defaultViewport: null,
      // args: ['--no-zygote', '--no-sandbox']
    });
    const url = 'https://www.linkedin.com/jobs/search?keywords=Junior%20Software%20Developer&location=Indianapolis%2C%20IN&geoId=&trk=homepage-jobseeker_jobs-search-bar_search-submit&position=1&pageNum=0';
    // Initiate a new page in the browser
    const page = await browser.newPage({
      waitUntil: 'networkidle0'
    });
    console.log(`Navigating to ${url}`);
    await page.goto(url);
    // Scroll to the bottom of the page, click on 'See More Jobs', and repeat
    let lastHeight = await page.evaluate('document.body.scrollHeight');
    const scroll = async () => {
      while (true) {
        await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
        await page.waitForTimeout(2000);
        let newHeight = await page.evaluate('document.body.scrollHeight');
        if (newHeight === lastHeight) {
          console.log('Done scrolling!');
          break;
        }
        lastHeight = newHeight;
        seeMoreJobs();
      }
      console.log(data);
    };
    // Click on 'See More Jobs'
    const seeMoreJobs = async () => {
      await page.evaluate(() => {
        document.querySelector('button[data-tracking-control-name="infinite-scroller_show-more"]').click();
      });
    };
    // Fetch and collect data
    const data = await page.evaluate(() => {
      const allJobsArr = Array.from(document.querySelectorAll('a[data-tracking-control-name="public_jobs_jserp-result_search-card"]'));
      const namesAndUrls = allJobsArr.map(job => {
        return {
          name: job.innerText,
          url: job.href,
          path: job.pathname
        };
      });
      return namesAndUrls;
    });
    scroll();
  } catch (err) {
    console.log(err);
  }
})();
The above script aims to open the specified url, then continuously scroll until reaching the end of the page. After that, I intend to output an array containing an object with three properties for each job listing: name, url, and path (see the example entry below). While running the Immediately Invoked Function Expression (IIFE), I can scrape the initial 24-25 job postings displayed before any scrolling occurs.
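For example, a single entry in that array should look roughly like this (the values are placeholders I made up, not real scraped data):

{
  name: 'Junior Software Developer',             // from job.innerText
  url: 'https://www.linkedin.com/jobs/view/...', // from job.href
  path: '/jobs/view/...'                         // from job.pathname
}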
However, the issue arises when attempting to evaluate the entire page/document after all the scrolling is completed, via the data assignment (the page.evaluate call near the bottom of the script).
I have made several attempts and thoroughly analyzed the script's behavior, yet I am unable to find a solution. My ultimate objective is to iterate through every job posting visible after scrolling and log all the retrieved data with the desired properties to the console, rather than just the first 24-25 results.
Appreciate any assistance provided.