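Below are several self-contained examples of Apify.BasicCrawler from the Apify SDK (the apify NPM package). In the first one, the crawler takes a hard-coded list of URLs from a RequestList, downloads each page with request-promise, and stores the URL and raw HTML in the default dataset.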
const Apify = require('apify');
const requestPromise = require('request-promise');

Apify.main(async () => {
    // Create and initialize an instance of the RequestList class that contains
    // a list of URLs to crawl. Here we use just a few hard-coded URLs.
    const requestList = new Apify.RequestList({
        sources: [
            { url: 'http://www.google.com/' },
            { url: 'http://www.example.com/' },
            { url: 'http://www.bing.com/' },
            { url: 'http://www.wikipedia.com/' },
        ],
    });
    await requestList.initialize();

    // Create a BasicCrawler - the simplest crawler that enables
    // users to implement the crawling logic themselves.
    const crawler = new Apify.BasicCrawler({
        // Let the crawler fetch URLs from our list.
        requestList,

        // This function will be called for each URL to crawl.
        // The 'request' argument is an instance of the Request class, which contains
        // information such as the URL and HTTP method, as supplied by the RequestList.
        handleRequestFunction: async ({ request }) => {
            console.log(`Processing ${request.url}...`);

            // Fetch the page HTML.
            const html = await requestPromise(request.url);

            // Store the HTML and URL to the default dataset.
            await Apify.pushData({
                url: request.url,
                html,
            });
        },
    });

    // Run the crawler and wait for it to finish.
    await crawler.run();

    console.log('Crawler finished.');
});
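The second example works the same way, but logs the length of the downloaded HTML and stores an errors field alongside each result.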
const Apify = require('apify');
const rp = require('request-promise');

Apify.main(async () => {
    // Create a request list.
    const requestList = new Apify.RequestList({
        sources: [
            { url: 'http://www.example.com' },
            { url: 'http://www.example.com/?page=2' },
            { url: 'http://www.example.com/?page=3' },
            { url: 'http://www.example.com/?page=4' },
            { url: 'http://www.example.com/?page=5' },
        ],
    });
    await requestList.initialize();

    const crawler = new Apify.BasicCrawler({
        requestList,

        // This function is executed for each request.
        // If a request fails, it is retried up to 3 times by default.
        handleRequestFunction: async ({ request }) => {
            const pageHtml = await rp(request.url);
            console.log(`Request ${request.url} succeeded and returned HTML of length ${pageHtml.length}`);

            await Apify.pushData({
                url: request.url,
                html: pageHtml,
                errors: null,
            });
        },
    });

    // Run the crawler and wait for it to finish.
    await crawler.run();
});
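The third example uses a RequestQueue instead of a RequestList and parses the downloaded HTML with cheerio, collecting story titles from Hacker News.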
const Apify = require('apify');
const rp = require('request-promise');
const cheerio = require('cheerio');

Apify.main(async () => {
    // Open the request queue and enqueue the start URL.
    const requestQueue = await Apify.openRequestQueue();
    await requestQueue.addRequest(new Apify.Request({ url: 'https://news.ycombinator.com/' }));

    // Create the crawler.
    const crawler = new Apify.BasicCrawler({
        requestQueue,

        // This function is executed for each request.
        // If a request fails, it is retried up to 3 times by default.
        handleRequestFunction: async ({ request }) => {
            console.log(`Processing ${request.url}...`);

            // Request the HTML of the page.
            const html = await rp(request.url);

            // Extract data with cheerio.
            const data = [];
            const $ = cheerio.load(html);
            $('.athing').each((index, el) => {
                data.push({
                    title: $(el).find('.title a').text(),
                });
            });

            // Store the extracted items to the default dataset.
            await Apify.pushData(data);
        },
    });

    // Run the crawler and wait for it to finish.
    await crawler.run();
});
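The last example reads the list of URLs from the actor input, saves debug information with every result, and records failed requests separately.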
const Apify = require('apify');
const rp = require('request-promise');

Apify.main(async () => {
    // Get the input of the actor (here only for demonstration purposes).
    // If you'd like to have your input checked and have Apify display
    // a user interface for it, add an INPUT_SCHEMA.json file to your actor.
    // For more information, see https://apify.com/docs/actor/input-schema
    const input = await Apify.getInput();
    console.log('Input:');
    console.dir(input);

    if (!input || !input.sources) throw new Error('Input must be a JSON object with the "sources" field!');

    const requestList = await Apify.openRequestList('my-request-list', input.sources);

    // Create a basic crawler that will use request-promise to download
    // web pages from the given list of URLs.
    const basicCrawler = new Apify.BasicCrawler({
        requestList,
        handleRequestFunction: async ({ request }) => {
            await Apify.pushData({
                request,
                finishedAt: new Date(),
                html: await rp(request.url),
                '#debug': Apify.utils.createRequestDebugInfo(request),
            });
        },
        handleFailedRequestFunction: async ({ request }) => {
            await Apify.pushData({
                '#isFailed': true,
                '#debug': Apify.utils.createRequestDebugInfo(request),
            });
        },
    });

    // Run the crawler and wait for it to finish.
    await basicCrawler.run();

    console.log('Crawler finished.');
});
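For illustration only: the "sources" array from the actor input is passed straight to Apify.openRequestList, so a minimal input JSON for the actor above could look like this (the URLs are placeholders):

{
    "sources": [
        { "url": "http://www.example.com" },
        { "url": "http://www.example.com/?page=2" }
    ]
}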