While scraping a website, a GET request returns a string like this:
jQuery18305426675335038453_1429531451051({"d":[{"__metadata":"cool"}]})
The complete code snippet is provided below:
// Endpoint queried by the scraper. The `jsonp` query parameter names the
// callback that the server wraps around the JSON payload in its response.
// NOTE(review): the trailing "%20HTTP/1.1" looks like it was copied from a raw
// HTTP request line rather than being part of the query string - confirm.
var baseUrl = "http://SOMEURL.COM?spatialFilter=nearby(52.47952651977539,-1.911009430885315,400)&$select=*&$top=200&$format=json&key=AjF8l9J6TH-WM5tkfFYdYE8NVUx9SFe4ya9aBaxKFFPBImvFWWHPOsRMSBesWblU&jsonp=jQuery18305426675335038453_1429531451051&_=1429532300821%20HTTP/1.1";

// CasperJS instance; image and plugin loading are switched off in pageSettings.
var casper = require('casper').create({
    verbose: false,
    logLevel: 'debug',
    pageSettings: {
        loadImages: false,
        loadPlugins: false
    }
});

// Shared script state. Every name below belongs to ONE `var` statement, so the
// declarators must be separated by commas: a semicolon in the middle of the
// chain would end the statement and turn the remaining names into implicit
// globals.
var fs = require('fs'),
    shopInfo,                           // assigned later by the scraping step
    savePath,                           // assigned later by saveToFile()
    date = new Date(),
    secondsNow = date.getSeconds(),
    day = date.getDate(),
    minute = date.getMinutes(),
    month = date.getMonth() + 1,        // Date#getMonth() is zero-based
    // Output file name: virginmedia-<month>-<day>-<minute>-<second>.txt
    fname = 'virginmedia-' + month + '-' + day + '-' + minute + '-' + secondsNow + '.txt';
/**
 * Write the given text to <working directory>/output/<fname>.
 *
 * The resolved path is stored in the script-level `savePath` variable before
 * the file is written in 'w' mode.
 *
 * @param {string} finalData - Text to write to the output file.
 */
function saveToFile(finalData) {
    var pathSegments = [fs.workingDirectory, 'output', fname];
    savePath = fs.pathJoin.apply(fs, pathSegments);
    fs.write(savePath, finalData, 'w');
}
// Casper#start() only accepts (url, thenCallback); an options object passed as
// its second argument is ignored, so the request settings never reach the
// request. Start with no URL and open the page through thenOpen(), which does
// take a settings object (method, headers, ...).
casper.start();
casper.thenOpen(baseUrl, {
    method: 'get',
    headers: {
        'Accept': 'application/json'
    }
});
/**
 * Remove a JSONP wrapper (`callbackName( ... )`, optionally followed by `;`
 * and/or whitespace) from a response body.
 *
 * The callback name is matched generically instead of being hard-coded, so the
 * function keeps working when the `jsonp` parameter of the request changes.
 *
 * @param {string} text - Raw response body.
 * @returns {string} The text between the outermost parentheses, or `text`
 *     unchanged when it is not wrapped in a call (e.g. it is already JSON).
 */
function unwrapJsonp(text) {
    var wrapped = /^\s*[\w$.]+\s*\(([\s\S]*)\)\s*;?\s*$/.exec(text);
    return wrapped ? wrapped[1] : text;
}

// Scraping step: read the response body, parse the payload, keep three fields
// per entry of `d.results`, write them to disk as JSON and report the count.
casper.then(function getData() {
    var rawData = this.getPageContent();

    // Kept in the script-level `shopInfo` variable, as before.
    shopInfo = JSON.parse(unwrapJsonp(rawData));

    var resultPack = shopInfo.d.results;
    var finalData = resultPack.map(function (val) {
        return [
            val.Latitude,
            val.Longitude,
            val.EntityStoreName
        ];
    });

    saveToFile(JSON.stringify(finalData));
    casper.echo("\n Hello! I just returned " + finalData.length
        + " shops");
});

casper.run();
In essence, it's valid JSON wrapped in a function call. However, I only need the JSON part.
If within a browser environment, I could easily create a function with the same name that would return its own parameters:
/**
 * Stand-in for the JSONP callback: hands back whatever it is called with.
 *
 * @returns {*} The first argument passed to the call (undefined if none).
 */
function jQuery18305426675335038453_1429531451051() {
    var payload = arguments[0];
    return payload;
}
Unfortunately, this approach doesn't seem to work in CasperJS, so my workaround was to strip the wrapper with a string replacement and a regex:
// Drop the leading callback name with its opening parenthesis, then the
// closing parenthesis at the very end of the string.
shopInfo = shopInfo
    .replace("jQuery18305426675335038453_1429531451051(", '')
    .replace(/\)$/, '');
I'm curious if there is a better way to achieve this?
Edit 1: According to the comments, the response is JSONP rather than JSON, which simplifies things. I found a solution after reading up on JSONP here.
Edit 2 : Another suggestion from the comments indicated that by modifying the request, the website can directly return proper JSON data!