const Apify = require('apify');

Apify.main(async () => {
// Read the actor input configuration containing the URLs for the screenshot.
// By convention, the input is present in the actor's default key-value store under the "INPUT" key.
const input = await Apify.getInput();
if (!input) throw new Error('Have you passed the correct INPUT?');
const { sources } = input;
const requestList = new Apify.RequestList({ sources });
await requestList.initialize();
const crawler = new Apify.PuppeteerCrawler({
requestList,
handlePageFunction: async ({ page, request }) => {
console.log(`Processing ${request.url}...`);
// This is a Puppeteer function that takes a screenshot of the page and returns its buffer.
const screenshotBuffer = await page.screenshot();
// The record key may only include the following characters: a-zA-Z0-9!-_.'()
const key = request.url.replace(/[:/]/g, '_');
// Save the screenshot. Choosing the right content type will automatically
// assign the local file the right extension, in this case .png.
// The screenshots will be stored in ./apify_storage/key_value_stores/default/
await Apify.setValue(key, screenshotBuffer, { contentType: 'image/png' });
console.log(`Screenshot of ${request.url} saved.`);
},
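});

// Run the crawler and wait for it to finish. These closing lines are a minimal
// sketch of the usual Apify SDK pattern; the excerpt above ends inside the crawler options.
await crawler.run();
console.log('Crawler finished.');
});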
const Apify = require('apify');

Apify.main(async () => {
// Get the input of the actor (here only for demonstration purposes).
// If you'd like to have your input checked and have Apify display
// a user interface for it, add an INPUT_SCHEMA.json file to your actor.
// For more information, see https://apify.com/docs/actor/input-schema
const input = await Apify.getInput();
console.log('Input:');
console.dir(input);
// Open a request queue and add a start URL to it
const requestQueue = await Apify.openRequestQueue();
await requestQueue.addRequest({ url: 'https://www.iana.org/' });
// Define a pattern of URLs that the crawler should visit
const pseudoUrls = [new Apify.PseudoUrl('https://www.iana.org/[.*]')];
// Create a crawler that will use headless Chrome / Puppeteer to extract data
// from pages and recursively add links to newly-found pages
const crawler = new Apify.PuppeteerCrawler({
requestQueue,
// This function is called for every page the crawler visits
handlePageFunction: async ({ request, page }) => {
const title = await page.title();
console.log(`Title of ${request.url}: ${title}`);
await Apify.pushData({
title,
'#debug': Apify.utils.createRequestDebugInfo(request),
});
await Apify.utils.enqueueLinks({ page, selector: 'a', pseudoUrls, requestQueue });
},
// This function is called for every page the crawler failed to load,
// or for which handlePageFunction() kept failing after "maxRequestRetries" retries.
handleFailedRequestFunction: async ({ request }) => {
console.log(`Request ${request.url} failed too many times.`);
},
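});

// Minimal closing sketch, as in the screenshot example above: run the crawler
// and wait for it to finish.
await crawler.run();
console.log('Crawler finished.');
});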
// Temporary fix: make the UI proxy input compatible.
if (input.proxyConfig && input.proxyConfig.apifyProxyGroups) {
// Keep only the last segment of each group name (everything after the final '-').
input.proxyConfig.apifyProxyGroups = input.proxyConfig.apifyProxyGroups
.map((group) => group.split('-').pop());
}
// Simulated browser cache.
const cache = {};
// Main crawler variable.
const crawler = new Apify.PuppeteerCrawler({
requestList,
requestQueue,
handlePageTimeoutSecs: 120,
// Browser instance creation.
launchPuppeteerFunction: () => {
if (!input.testProxy) {
return Apify.launchPuppeteer(input.proxyConfig || {});
}
// getWorkingBrowser() is a helper defined elsewhere in this actor (not shown in this excerpt).
return getWorkingBrowser(startUrl, input);
},
// Main page handling function.
handlePageFunction: async ({ page, request, puppeteerPool }) => {
Apify.main(async () => {
// Get queue and enqueue first url.
const requestQueue = await Apify.openRequestQueue();
// Enqueue Start url.
await requestQueue.addRequest(new Apify.Request({ url: 'https://news.ycombinator.com/' }));
// Create crawler.
const crawler = new Apify.PuppeteerCrawler({
requestQueue,
// This function is executed for each request.
// If a request fails, it is retried up to 3 times by default.
// The "page" parameter is Puppeteer's Page object with the loaded page.
handlePageFunction: async ({ page, request }) => {
console.log(`Processing ${request.url}...`);
// Extract all posts.
const pageFunction = ($posts) => {
const data = [];
$posts.forEach(($post) => {
data.push({
title: $post.querySelector('.title a').innerText,
rank: $post.querySelector('.rank').innerText,
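// The remaining lines are a minimal sketch following the usual Hacker News
// example: finish the extracted record, evaluate pageFunction on all
// '.athing' rows, store the results, and run the crawler.
href: $post.querySelector('.title a').href,
});
});
return data;
};
const data = await page.$$eval('.athing', pageFunction);
// Store the results to the default dataset.
await Apify.pushData(data);
},
});
await crawler.run();
console.log('Crawler finished.');
});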
Apify.main(async () => {
// Apify.openRequestQueue() is a factory to get a preconfigured RequestQueue instance.
// We add our first request to it - the initial page the crawler will visit.
const requestQueue = await Apify.openRequestQueue();
await requestQueue.addRequest({ url: 'https://news.ycombinator.com/' });
// Create an instance of the PuppeteerCrawler class - a crawler
// that automatically loads the URLs in headless Chrome / Puppeteer.
const crawler = new Apify.PuppeteerCrawler({
requestQueue,
// Here you can set options that are passed to the Apify.launchPuppeteer() function.
// For example, you can set "slowMo" to slow down Puppeteer operations to simplify debugging
launchPuppeteerOptions: { slowMo: 500 },
// Stop crawling after several pages
maxRequestsPerCrawl: 10,
// This function will be called for each URL to crawl.
// Here you can write the Puppeteer scripts you are familiar with,
// with the exception that browsers and pages are automatically managed by the Apify SDK.
// The function accepts a single parameter, which is an object with the following fields:
// - request: an instance of the Request class with information such as URL and HTTP method
// - page: Puppeteer's Page object (see https://pptr.dev/#show=api-class-page)
handlePageFunction: async ({ request, page }) => {
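// The body below is a minimal sketch rather than the original example's full
// logic: log the page title and store it in the default dataset.
console.log(`Processing ${request.url}...`);
await Apify.pushData({ url: request.url, title: await page.title() });
},
});
await crawler.run();
console.log('Crawler finished.');
});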
Apify.main(async () => {
// Apify.openRequestQueue() is a factory to get a preconfigured RequestQueue instance.
// We add our first request to it - the initial page the crawler will visit.
const requestQueue = await Apify.openRequestQueue();
await requestQueue.addRequest({ url: 'https://news.ycombinator.com/' });
// Create an instance of the PuppeteerCrawler class - a crawler
// that automatically loads the URLs in headless Chrome / Puppeteer.
const crawler = new Apify.PuppeteerCrawler({
requestQueue,
// Here you can set options that are passed to the Apify.launchPuppeteer() function.
launchPuppeteerOptions: {
// For example, by adding "slowMo" you'll slow down Puppeteer operations to simplify debugging
// slowMo: 500,
},
// Stop crawling after several pages
maxRequestsPerCrawl: 10,
// This function will be called for each URL to crawl.
// Here you can write the Puppeteer scripts you are familiar with,
// with the exception that browsers and pages are automatically managed by the Apify SDK.
// The function accepts a single parameter, which is an object with the following fields:
// - request: an instance of the Request class with information such as URL and HTTP method
// - page: Puppeteer's Page object (see https://pptr.dev/#show=api-class-page)
Apify.main(async () => {
const requestList = new Apify.RequestList({
sources: [{ requestsFromUrl: 'https://edition.cnn.com/sitemaps/cnn/news.xml' }],
});
await requestList.initialize();
const crawler = new Apify.PuppeteerCrawler({
requestList,
handlePageFunction: async ({ page, request }) => {
console.log(`Processing ${request.url}...`);
await Apify.pushData({
url: request.url,
title: await page.title(),
html: await page.content(),
});
},
});
await crawler.run();
console.log('Done.');
});
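The example above stores one record per page in the default dataset via Apify.pushData(). As a small follow-up sketch (not part of the original snippet), the stored items can be read back with the SDK's dataset API:

const Apify = require('apify');

Apify.main(async () => {
// Open the default dataset that pushData() wrote into.
const dataset = await Apify.openDataset();
// getData() returns the stored items (default pagination).
const { items } = await dataset.getData();
items.forEach(({ url, title }) => console.log(`${title} - ${url}`));
});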
puppeteerPoolOptions: {
useLiveView: true,
recycleDiskCache: true,
},
launchPuppeteerOptions: {
...(_.omit(this.input.proxyConfiguration, 'proxyUrls')),
ignoreHTTPSErrors: this.input.ignoreSslErrors,
defaultViewport: DEFAULT_VIEWPORT,
devtools: this.devtools,
useChrome: this.input.useChrome,
stealth: this.input.useStealth,
args,
},
};
this.crawler = new Apify.PuppeteerCrawler(options);
return this.crawler;
}
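The fragment above comes from a larger crawler-setup class, so it references values defined elsewhere (this.input, DEFAULT_VIEWPORT, args, and lodash's _). A rough, self-contained sketch of the same options with plain literal values, assuming Apify SDK v0.x, might look like this:

const Apify = require('apify');

Apify.main(async () => {
const requestQueue = await Apify.openRequestQueue();
await requestQueue.addRequest({ url: 'https://www.iana.org/' });
const crawler = new Apify.PuppeteerCrawler({
requestQueue,
handlePageFunction: async ({ page, request }) => {
console.log(`Loaded ${request.url}`);
},
// Options forwarded to the internal PuppeteerPool.
puppeteerPoolOptions: {
useLiveView: true,
recycleDiskCache: true,
},
// Options forwarded to Apify.launchPuppeteer().
launchPuppeteerOptions: {
ignoreHTTPSErrors: true,
defaultViewport: { width: 1280, height: 720 },
useChrome: true,
stealth: true,
args: ['--no-sandbox'],
},
});
await crawler.run();
});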
Apify.main(async () => {
const requestQueue = await Apify.openRequestQueue();
await requestQueue.addRequest({ url: 'https://www.iana.org/' });
const pseudoUrls = [new Apify.PseudoUrl('https://www.iana.org/[.*]')];
const crawler = new Apify.PuppeteerCrawler({
requestQueue,
handlePageFunction: async ({ request, page }) => {
const title = await page.title();
console.log(`Title of ${request.url}: ${title}`);
await Apify.utils.enqueueLinks({ page, selector: 'a', pseudoUrls, requestQueue });
},
maxRequestsPerCrawl: 100,
maxConcurrency: 10,
});
await crawler.run();
});
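In the example above, the part of the PseudoUrl string enclosed in square brackets ('[.*]') is treated as a regular expression, so only links under https://www.iana.org/ are enqueued, while maxRequestsPerCrawl and maxConcurrency cap the total number of pages and the number processed in parallel. A narrower, hypothetical pattern works the same way:

// Hypothetical, narrower pattern: only enqueue links under /domains/.
const domainPages = [new Apify.PseudoUrl('https://www.iana.org/domains/[.*]')];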