// Secure your code as it's written. Use Snyk Code to scan source code in minutes — no build needed — and fix issues immediately.
// Entry point: crawl every URL listed in the remote CSV with a CheerioCrawler.
// NOTE(review): the original fragment was truncated mid-options (no page handler,
// no crawler.run(), unbalanced braces); reconstructed into a complete script
// while preserving every option visible in the fragment.
Apify.main(async () => {
    // Create an instance of the RequestList class that contains a list of URLs to crawl.
    // Here we download and parse the list of URLs from an external file.
    const requestList = new Apify.RequestList({
        sources: [{ requestsFromUrl: CSV_LINK }],
    });
    await requestList.initialize();

    // Create an instance of the CheerioCrawler class - a crawler
    // that automatically loads the URLs and parses their HTML using the cheerio library.
    const crawler = new Apify.CheerioCrawler({
        // Let the crawler fetch URLs from our list.
        requestList,

        // The crawler downloads and processes the web pages in parallel, with a concurrency
        // automatically managed based on the available system memory and CPU (see AutoscaledPool class).
        // Here we define some hard limits for the concurrency.
        minConcurrency: 10,
        maxConcurrency: 50,

        // On error, retry each page at most once.
        maxRequestRetries: 1,

        // Increase the timeout for processing of each page.
        handlePageTimeoutSecs: 60,

        // NOTE(review): systemStatusOptions is an AutoscaledPool option, so it must be
        // nested under autoscaledPoolOptions — it is not a top-level CheerioCrawler
        // option. The original fragment had it at the top level.
        autoscaledPoolOptions: {
            systemStatusOptions: {
                // Cheerio does a lot of sync operations, so we need to
                // give it some time to do its job.
                maxEventLoopOverloadedRatio: MAX_EVENT_LOOP_OVERLOADED_RATIO,
            },
        },

        // Extra options passed to the underlying HTTP client for each request.
        requestOptions: {
            headers: {},
        },

        // This function will be called for each URL to crawl.
        // TODO(review): the original page-handling logic was lost in the fragment;
        // restore the real extraction code here.
        handlePageFunction: async ({ request }) => {
            Apify.utils.log.info(`Processing ${request.url}`);
        },
    });

    // Run the crawler and wait for it to finish.
    await crawler.run();
});
// NOTE(review): the lines below are the tail of a class method whose header
// (and the declaration of the local `options` object) lies outside this
// fragment — likely spliced in from a different file. Left byte-identical;
// reconcile with the enclosing class before editing.
if (this.cookieJar) {
// Share the instance's cookie jar with every request so the crawler
// reuses session cookies across pages.
options.requestOptions.cookieJar = this.cookieJar;
}
// Build the crawler from the accumulated options and expose it on the
// instance as well as returning it to the caller.
this.crawler = new Apify.CheerioCrawler(options);
return this.crawler;
}