My current project involves data scraping from a real estate website (). I've been able to extract all the listing information successfully, except for the images (image links/src). Essentially, after a few images, what I get is just garbage. I delved into research and discovered that this issue arises due to lazy loading. I've attempted various methods suggested by others, such as scrolling to the bottom of the page, implementing delays while scrolling (https://www.npmjs.com/package/puppeteer-autoscroll-down), and even zooming out the browser to trigger image rendering. Unfortunately, none of these solutions seem to work. After spending hours scouring the web for answers, I've decided to turn to the community here and share my code in hopes that someone can help me unravel this puzzle.
let cheerio = require('cheerio')
let puppeteer = require('puppeteer-extra')
const pluginStealth = require("puppeteer-extra-plugin-stealth")
puppeteer.use(pluginStealth())
let userAgent = require('random-useragent')
const baseURL = "https://www.zillow.com/vancouver-bc"
let estateData = []
let urlLinks = []
let scrollPageToBottom = require('puppeteer-autoscroll-down')
let getEstateData = async () => {
estateData = []
urlLinks = []
let url
for (let pgNum = 1; pgNum <= 1; pgNum++) {
if (pgNum === 1) {
url = baseURL + "/"
} else {
url = baseURL + ("/" + pgNum + "_p")
}
urlLinks.push(url)
}
await searchWebsite()
console.log("search over")
return estateData
//module.exports = estateData
}
let searchWebsite = async () => {
await puppeteer
.launch({headless : false})
.then(async function (browser) {
let page = await browser.newPage();
// await page.setRequestInterception(true)
//
// page.on('request', (req) => {
// if( req.resourceType() === 'image' || req.resourceType() === 'stylesheet' || req.resourceType() === 'font'){
// req.abort()
// }
// else {
// req.continue()
// }
//
// })
let html
await page.setUserAgent(userAgent.getRandom())
for(let url of urlLinks){
console.log(url)
await page.goto(url).then(async function () {
html = await page.content();
let obj = await cheerio('.list-card-link.list-card-info', html)
let imgObj = await cheerio(".list-card-top", html)
let geoLocation = await cheerio(".photo-cards.photo-cards_wow", html)
// await page.waitForSelector('img',{
// visible: true,
// })
// await page.evaluate(() => { window.scrollTo(0, document.body.scrollHeight)})
const scrollStep = 250 // default
const scrollDelay = 100 // default
const lastPosition = await scrollPageToBottom(page, scrollStep, scrollDelay)
await page.waitFor(2000)
let num = 0
console.log(obj.length)
for (let key in obj) {
if (obj[key].attribs) {
try {
let geoStr = await geoLocation[0].children[0].children[0].children[0].data
let geoObj = await (JSON.parse(geoStr)["geo"])
let extractedInfo = {
estateName : await obj[key].children[0].children[0].data,
estatePrice : await obj[key].children[2].children[0].children[0].data,
saleType : await obj[key].children[1].children[0].next.data,
estateConfig : {
beds : await obj[key].children[2].children[1].children[0].children[0].data,
bath : await obj[key].children[2].children[1].children[1].children[0].data,
area : await obj[key].children[2].children[1].children[2].children[0].data
},
estateLocation : {
longitude : await geoObj.longitude,
latitude : await geoObj.latitude
},
estateLink : await obj[key].attribs.href,
estateCoverImgLink : await imgObj[num++].children[2].children[0].attribs.src
}
console.log(extractedInfo.estateName, imgObj[num].children[2].children[0].attribs.src)
await estateData.push(extractedInfo)
}
catch (e) {
console.log("Estate Skipped - ", obj[key].children[0].children[0].data, obj[key].attribs.href)
console.log(e)
}
}
}
console.log(estateData.length)
});
}
console.log("total - ", estateData.length)
await page.close()
await browser.close()
})
.catch(function (err) {
console.log(err)
});
}
module.exports.getEstateData = getEstateData