What is the best way to extract information from a button that does not provide a response?

Question

What is the best way to extract information from a button that does not provide a response?

I'm trying to extract data from the site https://tonaton.com. There's a "next" button that needs to be clicked in order to scrape the contents. However, I'm having difficulty identifying the correct XPath or CSS selector for this button, which is preventing me from progressing with the scraping process. Any assistance would be greatly appreciated, as I'm currently stuck at this point. Below is the code snippet I have been working with, but it's not yielding the desired results.

# -*- coding: utf-8 -*-

import scrapy
import scrapy_selenium
from scrapy_selenium import SeleniumRequest

class VisionSpider(scrapy.Spider):
    """Scrapy spider registered under the name ``vision``."""

    name = 'vision'

def start_requests(self):
    """Seed the crawl with a single Selenium-rendered request.

    Yields:
        One ``SeleniumRequest`` for ``https://tonaton.com`` (3 second wait,
        screenshot enabled) whose response is handed to ``self.parse``.
    """
    request_options = {
        'url': 'https://tonaton.com',
        'wait_time': 3,
        'screenshot': True,
        'callback': self.parse,
    }
    yield SeleniumRequest(**request_options)


def parse(self, response):
    """Follow every category anchor matched on the response.

    For each anchor, the href and the text of its category paragraph are
    read, and a follow-up request is yielded whose response goes to
    ``self.parse_business``. The category text travels with the request
    under the ``business_category`` meta key.
    """
    category_anchors = response.xpath("//a[@class='link--1t8hM gtm-home-category-link-click']")
    for anchor in category_anchors:
        href = anchor.xpath(".//@href").get()
        label = anchor.xpath(".//div[2]/p/text()").get()

        yield response.follow(
            url=href,
            callback=self.parse_business,
            meta={'business_category': label},
        )


def parse_business(self, response):
    """Queue every ad matched on the page, then request the next page.

    The category stored under the ``business_category`` meta key of the
    incoming request is forwarded on every request yielded here.

    Yields:
        One ``response.follow`` request per ad anchor that has an href
        (handled by ``self.next_parse``), followed by at most one
        ``SeleniumRequest`` for the next page (handled by this method again).

    Raises:
        KeyError: if the incoming request carries no ``business_category``.
    """
    category = response.request.meta['business_category']

    for row in response.xpath("//a[@class='card-link--3ssYv gtm-ad-item']"):
        new_link = row.xpath(".//@href").get()
        # response.follow() rejects a None url, so anchors without an href
        # are skipped instead of aborting the whole callback.
        if new_link:
            yield response.follow(
                url=new_link,
                callback=self.next_parse,
                meta={'business_category': category},
            )

    # Scrapy selectors only read the downloaded HTML; they have no click().
    # Pagination is therefore done by following the link attached to the
    # button instead of trying to press it.
    # NOTE(review): assumes the button div wraps, or is wrapped by, an <a>
    # with an href -- confirm against the live markup.
    next_href = response.xpath(
        "//div[@class = 'action-button--1O8tU']//a/@href"
        " | //div[@class = 'action-button--1O8tU']/ancestor::a/@href"
    ).get()
    if next_href:
        yield SeleniumRequest(
            # Request needs an absolute URL; the href may be relative.
            url=response.urljoin(next_href),
            wait_time=3,
            # The next page is another listing page, so it comes back here
            # and keeps its category.
            callback=self.parse_business,
            meta={'business_category': category},
        )



def next_parse(self, response):
    """Follow each link matched by the member-link selector.

    The ``business_category`` meta value of the incoming request is copied
    onto every request yielded here; the responses are handled by
    ``self.another_parse``.
    """
    business_category = response.request.meta['business_category']
    member_anchors = response.xpath("//a[@class='member-link--IzDly gtm-visit-shop']")
    yield from (
        response.follow(
            url=anchor.xpath(".//@href").get(),
            callback=self.another_parse,
            meta={'business_category': business_category},
        )
        for anchor in member_anchors
    )

def another_parse(self, response):
    """Re-request the current page with the "show number" button clicked.

    A Scrapy selector cannot click anything, so the page is fetched again
    through ``SeleniumRequest`` with a ``script`` that presses the button in
    the browser. The resulting HTML is handed to ``self.new_parse`` together
    with the ``business_category`` meta value of the incoming request.

    Raises:
        KeyError: if the incoming request carries no ``business_category``.
    """
    category = response.request.meta['business_category']

    # The guard keeps the script from throwing when the page has no button.
    reveal_script = (
        "var btn = document.querySelector("
        "'button.contact-section--1qlvP.gtm-show-number');"
        "if (btn) { btn.click(); }"
    )

    # NOTE(review): if the number is loaded asynchronously after the click,
    # a `wait_until` condition may be needed as well -- confirm on the site.
    yield SeleniumRequest(
        url=response.url,
        wait_time=3,
        script=reveal_script,
        callback=self.new_parse,
        meta={'business_category': category},
        # Same URL as the response being handled: without this flag the
        # duplicate filter would drop the request.
        dont_filter=True,
    )


def new_parse(self, response):
    """Emit one item per info container matched on the response.

    Each item holds the ``business_category`` meta value of the incoming
    request plus the name, phone and location texts read from the container
    (``None`` for any text that is not found).
    """
    business_category = response.request.meta['business_category']
    for container in response.xpath("//div[@class='info-container--3pMhK']"):
        business_name = container.xpath(".//div/span/text()").get()
        business_location = container.xpath(".//div/div/div/span/text()").get()
        phone_number = container.xpath(".//div[3]/div/button/div[2]/div/text()").get()

        item = {}
        item['business_category'] = business_category
        item['business_name'] = business_name
        item['phone'] = phone_number
        item['location'] = business_location
        yield item

javascript selenium scrapy scrapinghub scrapy-selenium

Answer 1

Answer №1

Despite attempting to implement changes, I am still unable to get the pagination functioning properly. Additionally, the process of clicking the call button for scraping purposes is taking longer than expected. Is there a method to enhance the speed of this operation?

class VisionSpider(scrapy.Spider):
    """Scrapy spider named ``vision`` that starts at https://tonaton.com."""

    name = 'vision'
    # Scrapy's off-site filtering reads `allowed_domains`; `main_domains` is
    # not an attribute Scrapy looks at, so on its own it restricts nothing.
    allowed_domains = ['tonaton.com']
    # Kept as an alias so any existing reference to the old name still works.
    main_domains = allowed_domains
    start_urls = ['https://tonaton.com']

def parse(self, response):
    """Follow the category anchors selected on the response.

    The XPath carries a ``[1]`` position predicate. For every anchor it
    returns, the href and the category paragraph text are read and a
    follow-up request is yielded to ``self.parse_business``, with the text
    stored under the ``business_category`` meta key.
    """
    selected_anchors = response.xpath("//a[@class='link--1t8hM gtm-home-category-link-click'][1]")
    for anchor in selected_anchors:
        target = anchor.xpath(".//@href").get()
        label = anchor.xpath(".//div[2]/p/text()").get()

        yield response.follow(
            url=target,
            callback=self.parse_business,
            meta={'business_category': label},
        )


def parse_business(self, response):
    """Queue every ad matched on the page, then try to reach the next page.

    Ad links are followed with plain Scrapy requests handled by
    ``self.new_parse``. For pagination a headless Chrome instance opens the
    same URL, clicks the right-arrow element if there is one, and the URL the
    browser ends up on is requested with this method as callback.

    Raises:
        KeyError: if the incoming request carries no ``business_category``.
    """
    category = response.request.meta['business_category']

    for row in response.xpath("//a[@class='card-link--3ssYv gtm-ad-item']"):
        new_link = row.xpath(".//@href").get()
        if new_link:
            yield response.follow(
                url=new_link,
                callback=self.new_parse,
                meta={'business_category': category, 'newlink': new_link},
            )

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_path = which("chromedriver")
    driver = webdriver.Chrome(options=chrome_options, executable_path=chrome_path)

    next_url = None
    try:
        # With an implicit wait, find_elements() polls for up to 10 seconds
        # and returns an empty list if nothing appears, so the last page does
        # not raise and does not block for minutes.
        driver.implicitly_wait(10)
        driver.get(response.url)
        arrows = driver.find_elements(
            By.XPATH,
            "//div[@class='icon--3D09z extra-small--_AIuZ arrow-right--17oRn']",
        )
        if arrows:
            arrows[0].click()
            # NOTE(review): assumes the click changes the browser URL by the
            # time current_url is read -- confirm on the live site.
            next_url = driver.current_url
    finally:
        # quit() ends the whole browser session, also when a step above raised.
        driver.quit()

    # A request needs a URL; only schedule one when the click really led
    # somewhere new, otherwise the same page would be requested again.
    if next_url and next_url != response.url:
        yield response.follow(
            url=next_url,
            callback=self.parse_business,
            meta={'business_category': category},
        )



def new_parse(self, response):
    """Open the page in headless Chrome, click the call button, emit items.

    The rendered HTML is captured after the click and parsed with
    ``Selector``. One item is yielded per details section; each holds the
    ``business_category`` meta value of the incoming request, the name,
    region and location read from the section, and the phone text read from
    the call button (``None`` when no call-button text is found).

    Raises:
        KeyError: if the incoming request carries no ``business_category``.
    """
    category = response.request.meta['business_category']

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_path = which("chromedriver")
    driver = webdriver.Chrome(options=chrome_options, executable_path=chrome_path)
    try:
        driver.get(response.url)
        # NOTE(review): 500 seconds is only the upper bound of the wait, but
        # a page without the button blocks this callback for that long.
        call_button = wait(driver, 500).until(
            EC.element_to_be_clickable((By.XPATH, "//div[@class='call-button--3uvWj']"))
        )
        call_button.click()
        resp = Selector(text=driver.page_source)
    finally:
        # quit() ends the whole browser session, also when the wait or the
        # click above raised.
        driver.quit()

    # Start from None so the items below can still be built when nothing
    # matches; with several matches the last one wins.
    phone = None
    for contact in resp.xpath("//div[@class='call-button--3uvWj']/div[1]"):
        phone = contact.xpath(".//text()").get()

    for details in resp.xpath("//div[@class='details-section--2ggRy']"):
        name = details.xpath(".//div[2]/div/div[2]/div/div/div/div/div/div/div/div/div/text()").get()
        location = details.xpath(".//div/div/div/span/a/span/text()[1]").get()
        region = details.xpath(".//div/div/div/span/a[2]/span/text()").get()

        yield {
            'business_category': category,
            'business_name': name,
            'phone': phone,
            'region': region,
            'location': location
        }

Answer 2

Despite attempting to implement changes, I am still unable to get the pagination functioning properly. Additionally, the process of clicking the call button for scraping purposes is taking longer than expected. Is there a method to enhance the speed of this operation?

class VisionSpider(scrapy.Spider):
    """Scrapy spider named ``vision`` that starts at https://tonaton.com."""

    name = 'vision'
    # Scrapy's off-site filtering reads `allowed_domains`; `main_domains` is
    # not an attribute Scrapy looks at, so on its own it restricts nothing.
    allowed_domains = ['tonaton.com']
    # Kept as an alias so any existing reference to the old name still works.
    main_domains = allowed_domains
    start_urls = ['https://tonaton.com']

def parse(self, response):
    """Follow the category anchors selected on the response.

    The XPath carries a ``[1]`` position predicate. For every anchor it
    returns, the href and the category paragraph text are read and a
    follow-up request is yielded to ``self.parse_business``, with the text
    stored under the ``business_category`` meta key.
    """
    selected_anchors = response.xpath("//a[@class='link--1t8hM gtm-home-category-link-click'][1]")
    for anchor in selected_anchors:
        target = anchor.xpath(".//@href").get()
        label = anchor.xpath(".//div[2]/p/text()").get()

        yield response.follow(
            url=target,
            callback=self.parse_business,
            meta={'business_category': label},
        )


def parse_business(self, response):
    """Queue every ad matched on the page, then try to reach the next page.

    Ad links are followed with plain Scrapy requests handled by
    ``self.new_parse``. For pagination a headless Chrome instance opens the
    same URL, clicks the right-arrow element if there is one, and the URL the
    browser ends up on is requested with this method as callback.

    Raises:
        KeyError: if the incoming request carries no ``business_category``.
    """
    category = response.request.meta['business_category']

    for row in response.xpath("//a[@class='card-link--3ssYv gtm-ad-item']"):
        new_link = row.xpath(".//@href").get()
        if new_link:
            yield response.follow(
                url=new_link,
                callback=self.new_parse,
                meta={'business_category': category, 'newlink': new_link},
            )

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_path = which("chromedriver")
    driver = webdriver.Chrome(options=chrome_options, executable_path=chrome_path)

    next_url = None
    try:
        # With an implicit wait, find_elements() polls for up to 10 seconds
        # and returns an empty list if nothing appears, so the last page does
        # not raise and does not block for minutes.
        driver.implicitly_wait(10)
        driver.get(response.url)
        arrows = driver.find_elements(
            By.XPATH,
            "//div[@class='icon--3D09z extra-small--_AIuZ arrow-right--17oRn']",
        )
        if arrows:
            arrows[0].click()
            # NOTE(review): assumes the click changes the browser URL by the
            # time current_url is read -- confirm on the live site.
            next_url = driver.current_url
    finally:
        # quit() ends the whole browser session, also when a step above raised.
        driver.quit()

    # A request needs a URL; only schedule one when the click really led
    # somewhere new, otherwise the same page would be requested again.
    if next_url and next_url != response.url:
        yield response.follow(
            url=next_url,
            callback=self.parse_business,
            meta={'business_category': category},
        )



def new_parse(self, response):
    """Open the page in headless Chrome, click the call button, emit items.

    The rendered HTML is captured after the click and parsed with
    ``Selector``. One item is yielded per details section; each holds the
    ``business_category`` meta value of the incoming request, the name,
    region and location read from the section, and the phone text read from
    the call button (``None`` when no call-button text is found).

    Raises:
        KeyError: if the incoming request carries no ``business_category``.
    """
    category = response.request.meta['business_category']

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_path = which("chromedriver")
    driver = webdriver.Chrome(options=chrome_options, executable_path=chrome_path)
    try:
        driver.get(response.url)
        # NOTE(review): 500 seconds is only the upper bound of the wait, but
        # a page without the button blocks this callback for that long.
        call_button = wait(driver, 500).until(
            EC.element_to_be_clickable((By.XPATH, "//div[@class='call-button--3uvWj']"))
        )
        call_button.click()
        resp = Selector(text=driver.page_source)
    finally:
        # quit() ends the whole browser session, also when the wait or the
        # click above raised.
        driver.quit()

    # Start from None so the items below can still be built when nothing
    # matches; with several matches the last one wins.
    phone = None
    for contact in resp.xpath("//div[@class='call-button--3uvWj']/div[1]"):
        phone = contact.xpath(".//text()").get()

    for details in resp.xpath("//div[@class='details-section--2ggRy']"):
        name = details.xpath(".//div[2]/div/div[2]/div/div/div/div/div/div/div/div/div/text()").get()
        location = details.xpath(".//div/div/div/span/a/span/text()[1]").get()
        region = details.xpath(".//div/div/div/span/a[2]/span/text()").get()

        yield {
            'business_category': category,
            'business_name': name,
            'phone': phone,
            'region': region,
            'location': location
        }

What is the best way to extract information from a button that does not provide a response?

Answer №1

Similar questions

Sending a `refresh` to a Context

Issue an alert and refresh the webpage when a file extension upload script is detected

Tips for transferring directive parameters to another directive

Incorporate and interpret a custom JSON object within my Shopify Liquid theme

Issue NG0203 encountered during material import attempt

React State not refreshing

Managing arrayBuffer in hapi.js: A Comprehensive Guide

Cloning a repository using the GitHub API via client-side Javascript and OAuth.io

Leverage i18next in an offline setting without the need for a web

"Troubleshooting: jQuery Find function not functioning correctly with HTML template

Load charts.js synchronously into a div using XMLHttpRequest

Saving videos for offline viewing in a Vue Progressive Web App with Vue.js 3

Is it possible to integrate iOS automation and web automation into a unified script utilizing selenium?

Can Webdrivermanager be used with RemoteWebDriver in Selenium Grid?

JavaScript Astro file not triggering window.onload event

Python Selenium is having trouble finding the elements

Adjust the background color of the unordered list when it corresponds to the current

Transform an array into a hierarchical JSON structure within an Angular Material tree

Send information from the textbox

Extend the center of the image horizontally