Scraping lazy-loaded images from a website with Puppeteer

My current project involves scraping data from a real estate website (https://www.zillow.com/vancouver-bc). I've been able to extract all the listing information successfully, except for the images (image links/src). After the first few images, the src values I get are garbage rather than usable image links. I researched the problem and found that it is caused by lazy loading: the images further down the page are not loaded until they come into view. I've attempted various methods suggested by others, such as scrolling to the bottom of the page, adding delays while scrolling (https://www.npmjs.com/package/puppeteer-autoscroll-down), and even zooming out the browser to trigger image rendering. Unfortunately, none of these solutions worked. After spending hours searching the web for answers, I've decided to turn to the community here and share my code in the hope that someone can help me solve this.

// HTML parser used to query the markup that Puppeteer returns
let cheerio        = require('cheerio')
// puppeteer-extra exposes the Puppeteer API plus a `use()` hook for plugins
let puppeteer      = require('puppeteer-extra')
const pluginStealth = require("puppeteer-extra-plugin-stealth")
// Register the stealth plugin before any browser is launched
// (presumably to make the automated browser harder to detect — confirm against the plugin docs)
puppeteer.use(pluginStealth())
// Supplies the random User-Agent string that searchWebsite sets on the page
let userAgent      = require('random-useragent')
// Root URL of the search results; getEstateData derives the per-page URLs from it
const baseURL      = "https://www.zillow.com/vancouver-bc"
// Listing records: reset by getEstateData, filled by searchWebsite, returned by getEstateData
let estateData     = []
// Result-page URLs: built by getEstateData, visited in order by searchWebsite
let urlLinks       = []

// Called by searchWebsite as scrollPageToBottom(page, scrollStep, scrollDelay) to scroll the page down
let scrollPageToBottom = require('puppeteer-autoscroll-down')


/**
 * Scrapes the first `pageCount` result pages under `baseURL` and returns the
 * collected listing records.
 *
 * The module-level `urlLinks` and `estateData` arrays are replaced with fresh
 * arrays on every call, so an array returned by an earlier call is never
 * mutated by a later one. The actual scraping is delegated to `searchWebsite`,
 * which reads `urlLinks` and appends to `estateData`.
 *
 * @param {number} [pageCount=1] Number of result pages to visit. Anything that
 *     is not a positive integer is treated as 1, which keeps the original
 *     single-page behaviour for callers that pass nothing (or pass something
 *     unrelated, e.g. when the function is used directly as a callback).
 * @returns {Promise<Array<object>>} The listing records gathered by `searchWebsite`.
 */
const getEstateData = async (pageCount = 1) => {
    const pages = Number.isInteger(pageCount) && pageCount > 0 ? pageCount : 1

    estateData = []
    urlLinks   = []
    for (let pgNum = 1; pgNum <= pages; pgNum++) {
        // Page 1 lives at the bare base URL; later pages use the "<n>_p" suffix
        urlLinks.push(pgNum === 1 ? `${baseURL}/` : `${baseURL}/${pgNum}_p`)
    }

    await searchWebsite()
    console.log("search over")
    return estateData
}

/**
 * Reads a listing's title from its card node for log output only.
 * Returns undefined instead of throwing, so that logging a skipped card can
 * never abort the whole scrape.
 */
const readEstateName = (card) => {
    try {
        return card.children[0].children[0].data
    } catch (e) {
        return undefined
    }
}

/**
 * Builds one listing record by walking fixed child positions of the parsed
 * DOM nodes. Throws (TypeError / SyntaxError) when a node is missing or the
 * geo JSON cannot be parsed; the caller treats that as "skip this listing".
 *
 * @param {object} card        Node matched by '.list-card-link.list-card-info'.
 * @param {object} imgCard     Node matched by '.list-card-top' at the same position as `card`.
 * @param {object} geoLocation Cheerio selection for '.photo-cards.photo-cards_wow'.
 * @returns {object} The listing record pushed onto `estateData`.
 */
const buildListing = (card, imgCard, geoLocation) => {
    // NOTE(review): this always reads the text of the FIRST nested node of the
    // container, so every listing on a page receives the same coordinates.
    // Kept as in the original because the per-card markup is not visible here.
    const geoStr = geoLocation[0].children[0].children[0].children[0].data
    const geoObj = JSON.parse(geoStr)["geo"]

    return {
        estateName : card.children[0].children[0].data,
        estatePrice : card.children[2].children[0].children[0].data,
        saleType : card.children[1].children[0].next.data,
        estateConfig : {
            beds :  card.children[2].children[1].children[0].children[0].data,
            bath :  card.children[2].children[1].children[1].children[0].data,
            area :  card.children[2].children[1].children[2].children[0].data
        },
        estateLocation : {
            longitude : geoObj.longitude,
            latitude : geoObj.latitude
        },
        estateLink : card.attribs.href,
        estateCoverImgLink : imgCard.children[2].children[0].attribs.src
    }
}

/**
 * Opens a (non-headless) browser, visits every URL in the module-level
 * `urlLinks` array and appends one record per listing card to the
 * module-level `estateData` array.
 *
 * For each page the order is: navigate, scroll to the bottom, wait briefly,
 * and only THEN snapshot the HTML. Lazy-loaded images get their real `src`
 * when they are scrolled into view, so a snapshot taken before scrolling
 * contains only the first few usable image links.
 *
 * Listings whose markup does not match the expected layout are logged and
 * skipped. Any other error is logged and ends the run early. Errors are never
 * rethrown, and the browser is closed in every case.
 *
 * @returns {Promise<void>} Resolves when scraping has finished or failed.
 */
const searchWebsite = async () => {
    const scrollStep = 250 // default
    const scrollDelay = 100 // default
    // Grace period after scrolling so the images scrolled past last can finish loading
    const settleDelayMs = 2000

    let browser
    try {
        browser = await puppeteer.launch({headless : false})
        const page = await browser.newPage()
        await page.setUserAgent(userAgent.getRandom())

        for (const url of urlLinks) {
            console.log(url)
            await page.goto(url)

            await scrollPageToBottom(page, scrollStep, scrollDelay)
            await new Promise((resolve) => setTimeout(resolve, settleDelayMs))

            // Snapshot AFTER scrolling so the lazy-loaded image sources are present
            const $ = cheerio.load(await page.content())
            const cards = $('.list-card-link.list-card-info').toArray()
            const imgCards = $('.list-card-top').toArray()
            const geoLocation = $('.photo-cards.photo-cards_wow')

            console.log(cards.length)
            for (const [index, card] of cards.entries()) {
                try {
                    // Pair card and image by position, so one skipped listing
                    // cannot shift the images of all the listings after it
                    const extractedInfo = buildListing(card, imgCards[index], geoLocation)
                    console.log(extractedInfo.estateName, extractedInfo.estateCoverImgLink)
                    estateData.push(extractedInfo)
                }
                catch (e) {
                    console.log("Estate Skipped - ", readEstateName(card), card.attribs.href)
                    console.log(e)
                }
            }
            console.log(estateData.length)
        }

        console.log("total - ", estateData.length)
    }
    catch (err) {
        console.log(err)
    }
    finally {
        // Closing the browser also closes its pages; done here so a failure
        // mid-scrape does not leave a browser process behind
        if (browser) {
            try {
                await browser.close()
            }
            catch (err) {
                console.log(err)
            }
        }
    }
}

// Expose getEstateData to other modules via CommonJS exports
module.exports.getEstateData = getEstateData

Answer №1

Dealing with a comparable problem, I stumbled upon an effective solution here. Hopefully, this resolves your issue as well. I made a slight adjustment to the interval, speeding it up from 100 to 30.

Answer №2

Utilizing the puppeteer-autoscroll-down library, I successfully tackled this challenge with a straightforward implementation. While it's unclear which images you were targeting, this method proved effective for me.

// Fix the viewport size, then open the listings page and wait for its load event
await page.setViewport({ width: 1300, height: 1000 });
await page.goto('https://www.zillow.com/vancouver-bc/', { waitUntil: 'load' });

// Reset the scroll position to the top-left corner before auto-scrolling
await page.evaluate(() => window.scrollTo(0, 0));

// Let puppeteer-autoscroll-down walk the page down to the bottom
await scrollPageToBottom(page);

// Collect the src of every <img> found inside a listing card
let imageLinks = await page.$$eval(
    '.list-card img',
    (images) => Array.from(images, (image) => image.src)
);

The array imageLinks contained 40 fully functional image links.

I hope this solution is beneficial to you, as I encountered some challenges while solving it myself.

Similar questions

If you have not found the answer to your question or you are interested in this topic, then look at other similar questions below or use the search

Modify the onerror function of the image tag within the onerror function

Here is a way to display images using the img tag: If 1.jpg exists, show 1.jpg. If not, check for 2.jpg and display it if it exists. If neither 1.jpg nor 2.jpg exist, display 3.jpg. <img src="1.jpg" onerror="this.src='2.jpg'; this.oner ...

Vue Eslint Extension

My current project utilizes the eslint vue plugin with specific rules set in place. "rules": { "vue/html-closing-bracket-newline": ["error", { "singleline": "never", "multiline": "always" }], "vue/html-closi ...

Generate a mesh representing an airplane, with vertices specifying various hues

After creating approximately 2500 meshes, I implemented an algorithm to determine the color of each mesh. The algorithm calculates a value based on the distance of each mesh to a "red-start" point, which then determines the final color. However, the curre ...

Tips for maintaining the integrity of blank space within a text node?

When intercepting a paste event and cleaning HTML off of the content using textNodes, I am faced with an issue where all white space is reduced to a single space and new lines are disregarded. For example, pasting: "hello world !" ends up being "h ...

Is it possible to convert ejs to jade?

I'm struggling with the conversion of this ejs code to jade: <h1>I’m planning on counting up to <%= counter %></h1> <p><% for(var i = 1 ; i <= counter ; i++) { %> <%= i %>... <% } %></ ...

Boundaries on Maps: A guide to verifying addresses within a boundary

User provides address on the website. If the address falls within the defined boundary, it is marked as "Eligible". If outside the boundary, labeled as "Ineligible". Are there any existing widgets or code snippets available to achieve this functio ...

An easy way to pass props to a component using useNavigate in React Router

Is it possible to send props directly to a component in React? const goToProjectPage = useNavigate(); useEffect(()=>{ ..... goToProjectPage("/projectpage"); //send props here },[]); ...

Searching for values within an array of objects by iterating through nested arrays to apply a filter

Having trouble returning the testcaseid from an array to this.filteredArray Able to fetch header value and all values of the array when the search word is empty. Seeking assistance with iterating through the testcaseid and header on the search input fiel ...

I must extract all the information from the webpage within the HTML tags, however, I am unsure of which specific tag to target for the data extraction

Here is the example of HTML code that includes a price: <meta itemprop="price" content="121080"> I have created this search code, but I am unsure which tag to use for finding the price: const puppeteer = require('puppeteer&a ...

How can you provide arguments to a mock function?

While using jest for unit testing, I am encountering the following line of code: jest.mock('../../requestBuilder'); In my project folder, there is a __mocks__ subfolder where I have stored my mock requestBuilder.js file. The jest unit test i ...

I'm struggling to make this background show up in a div

Anyone able to help me figure this out? I can't seem to get the achtergrond_homepage.png as a background in rounded corners. Edit: It seems like the gray color is always on top. Could it be controlled in the JavaScript part? This is the CSS: @ch ...

Tips for incorporating Bootstrap classes into a React project, setting the className attribute to an empty string

After setting up Bootstrap and Create-React-App using npm on my local machine, I proceeded to create a new React app. The first component I worked on was counter.jsx: import React, { Component } from 'react'; class Counter extends Component { ...

Can you explain the meaning of arguments[0] and arguments[1] in relation to the executeScript method within the JavascriptExecutor interface in Selenium WebDriver?

When utilizing the executeScript() method from the JavascriptExecutor interface in Selenium WebDriver, what do arguments[0] and arguments[1] signify? Additionally, what is the function of arguments[0] in the following code snippet. javaScriptExecutor.ex ...

The hidden attribute of UIWebView and its interplay with JavaScript

In the webViewDidStartLoad method, I hide the webview. Then a request is made. In the webViewDidFinishLoad method, I use stringByEvaluatingJavaScriptFromString. Finally, the webview is shown again. However, when I run the app, I can still see how the Java ...

Difficulty with Pomodoro clock's canvas ring function not working as expected

Hey everyone, good evening. I've been struggling with a particular section of code for quite some time now and could really use some help. When you check out the constructor at this.drawArc and then look into the prototype, I've printed a couple ...

Developing front-end libraries using the jspm workflow

What is the best way to convert a library written in TypeScript to ES5? While JSPM documentation focuses on web apps (such as with jspm bundle-sfx), the information I've found on Google seems more suited for a web app workflow rather than a library w ...

Struggling with integrating PHP within a JavaScript AJAX function

Here's a button I have: <button id= $id onClick="popup($id)">button</button> I am attempting to use it with an ajax function <script> function popup(id){ alert(" "); document.write(" "); } </script> My goal is to execute P ...

Retrieving string array elements within a Vue.js v-for iteration

Trying to extract string values from an array within a function. The component declared in the template looks like this: <video-preview v-for="video in playlist" vid-id="how-to-retrieve-string -value-from-the-current-iteration-in-playlist"></vi ...

Limiting the number of characters in a textarea using React with TypeScript

Having encountered two issues, I've developed a textarea component that not only allows users to input text but also keeps track of the number of characters they have typed. First Issue: I'm attempting to check if the length of the current input ...

Preventing Repetition in an HTML List using JavaScript

My HTML list is populated with values from JSON, and I have a button on the page. <button onclick="printJsonList()">CLICK</button> This button triggers the following JavaScript function: function printJsonList(){ console.log(ctNameKeep); ...