I've been attempting to extract data from a website using web scraping, but I'm struggling to figure out how to convert the data into a .json format and display it in the console.
const puppeteer = require('puppeteer-extra'), // in python: from puppeteer_extra import puppeteerj
StealthPlugin = require('puppeteer-extra-plugin-stealth'),
AdblockerPlugin = require('puppeteer-extra-plugin-adblocker'),
fs = require("fs");
puppeteer.use(StealthPlugin())
puppeteer.use(AdblockerPlugin())
const pageAddress = specificURL;
const processDataTable = async (tableHandle) => {
const table = {};
const heading = await tableHandle.$("thead tr th.heading-4");
if (heading) {
const title = await heading.getProperty('textContent').then(h => h.jsonValue());
table[title] = [];
const rows = await tableHandle.$$("tbody tr")
for (row of rows) {
const r = [],
head = await row.$("th"),
data = await row.$("td"),
hValue = await head.getProperty('textContent').then(v => v.jsonValue()),
dValue = await data.getProperty('textContent').then(v => v.jsonValue());
if (head && data) {
r.push(hValue);
r.push(dValue);
table[title].push(r);
}
}
}
return table;
}
(async () => {
// start chrome
const browser = await puppeteer.launch({defaultViewport: null});
// open a new tab
const page = await browser.newPage();
// navigate to the website for scraping
await page.goto(pageAddress);
// $: get one element with the specified selector
// $$: get an array of elements that match the selector
// selector ex.: table.table (a <table> with class="table", i.e <table class="table"></table>)
const tableElements = await page.$$("table.table");
const data = [];
for (t of tableElements) {
const tData = await processDataTable(t);
data.push(tData);
}
console.log(JSON.stringify(data));
fs.writeFile('data.json', data);
await browser.close();
})()
Even though the console.log(JSON.stringify(data)) is functioning correctly and displaying the data in the console, unfortunately, the writing function is not performing as expected.