Currently, I am operating a crawler that gets triggered through an expressjs call.
However, whenever I make the same request again, the crawler runs once more but indicates that all routes have already been completed. I even went to the extent of deleting the './storage' folder
I have gone through the documentation multiple times but still can't figure out how to successfully execute the purgeDefaultStorages() function.
Is there a way for me to completely "reset" the crawler so that there are no cached results?
import express from 'express'
import { PlaywrightCrawler, purgeDefaultStorages, enqueueLinks, Configuration } from 'crawlee';
const app = express();
let crawler
let run = async () => {
const config = new Configuration({ 'persistStorage': false, persistStorage: false }); //already tried with and without quotes.
Configuration.set('persistStorage', false) //added this direct configuration as a test too.
crawler = new PlaywrightCrawler({
launchContext: {
launchOptions: {
headless: true,
},
},
}, config);
crawler.router.addDefaultHandler(async ({ request, page, enqueueLinks }) => {
console.log(`Title of ${request.loadedUrl} ': img: ${request.id}`);
await enqueueLinks({
strategy: 'same-domain'
});
});
await crawler.run(['http://localhost:8088/']);
try {
await config.getStorageClient().purge()
await config.getStorageClient().teardown() //also tried including this just in case.
console.log('purging')
} catch (e) {
console.log(e)
}
}
app.get('/', async (req, res) => {
try {
await run();
res.status(200)
} catch (e) {
res.status(500)
}
});
const PORT = process.env.PORT || 8889;
app.listen(PORT, () => {
console.log(
`The container started successfully and is listening for HTTP requests on ${PORT}`
);