Currently, I am in the process of developing a website scraper using Javascript (Express) for my personal needs.
The main goal of this script is to extract basic text data from an external source and then convert it into JSON objects. However, I want to organize these objects in pairs inside an array, which is where I require some assistance from fellow developers like you.
At the moment, the output consists of generic JSON objects based on the number of extracted items (typically ranging from 8-16):
{
name: "John Doe",
email: "<a href="/cdn-cgi/l/email-protection" class="__cf_email__" data-cfemail="afc5c0c7c181cbc0caefc5c0c7c181ccc0c2">[email protected]</a>,
status: "active"
},
{
name: "Jane Doe",
email: "<a href="/cdn-cgi/l/email-protection" class="__cf_email__" data-cfemail="85efe4ebe0abe1eae0c5efe4ebe0abe6eae8">[email protected]</a>,
status: "inactive"
},
{
name: "Johnny Walker",
email: "<a href="/cdn-cgi/l/email-protection" class="__cf_email__" data-cfemail="69030601070710471e0805020c1b291e0805020c1b470a0604">[email protected]</a>",
status: "active"
},
{
name: "Jimmy Glenfiddich",
email: "<a href="/cdn-cgi/l/email-protection" class="__cf_email__" data-cfemail="e48e8d89899dca8388818a828d80808d878ca48388818a80828d80808d878cca878b89">[email protected]</a>
status: "active"
}
The desired outcome should resemble the following structure:
{
"pair-number": 1,
"pair:" [
{
name: "John Doe",
email: "<a href="/cdn-cgi/l/email-protection" class="__cf_email__" data-cfemail="4a20252224642e252f0a2025222464292527">[email protected]</a>,
status: "active"
},
{
name: "Jane Doe",
email: "<a href="/cdn-cgi/l/email-protection" class="__cf_email__" data-cfemail="c9a3a8a7ace7ada6ac89a3a8a7ace7aaa6a4">[email protected]</a>,
status: "inactive"
},
]
},
{
"pair-number": 2,
"pair:" [
{
name: "Johnny Walker",
email: "<a href="/cdn-cgi/l/email-protection" class="__cf_email__" data-cfemail="355f5a5d5b5b4c1b4254595e5047754254595e50471b565a58">[email protected]</a>",
status: "active"
},
{
name: "Jimmy Glenfiddich",
email: "<a href="/cdn-cgi/l/email-protection" class="__cf_email__" data-cfemail="462c2f2b2b3f68212a2328202f22222f252e06212a232822202f22222f252e6825292b">[email protected]</a>
status: "active"
}
]
}
This is how my `server.js` file looks like:
var express = require('express');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
var url = 'http://testurl.com;
var name, email, status;
app.get('/scrape', function(req, res) {
request(url, function (error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
data = {"name": name, "email": email, "status": status };
$('.scrape-class').filter(function() {
var that = $(this);
name = that.find('h5').text();
email = that.find('.email').text();
status = that.find('dl').children().first().text();
data.name = name;
data.email = email;
data.status = status;
console.log(data);
});
}
});
});
app.listen(80, function () {
console.log('Example app listening on port 80!')
})