// check if attribute is an Array
if (!Array.isArray(input.startUrls)) {
throw new Error('INPUT.startUrls must be an array!');
}
// convert any inconsistencies to the correct format
for (let i = 0; i < input.startUrls.length; i++) {
let request = input.startUrls[i];
if (typeof request === 'string') { request = { url: request }; }
if ((!request.userData || request.userData.label !== 'detail') && request.url.indexOf('/hotel/') > -1) {
request.userData = { label: 'detail' };
}
request.url = addUrlParameters(request.url, input);
input.startUrls[i] = request;
}
// create RequestList and reference startUrl
requestList = new Apify.RequestList({ sources: input.startUrls });
startUrl = addUrlParameters('https://www.booking.com/searchresults.html?dest_type=city;ss=paris&order=bayesian_review_score', input);
await requestList.initialize();
} else {
// Create startURL based on provided INPUT.
const dType = input.destType || 'city';
const query = encodeURIComponent(input.search);
startUrl = `https://www.booking.com/searchresults.html?dest_type=${dType};ss=${query}&order=${sortBy}`;
startUrl = addUrlParameters(startUrl, input);
// Enqueue all pagination pages.
startUrl += '&rows=20';
console.log(`startUrl: ${startUrl}`);
await requestQueue.addRequest(new Apify.Request({ url: startUrl, userData: { label: 'start' } }));
if (!input.useFilters && input.propertyType == 'none' && input.minMaxPrice == 'none' && input.maxPages) {
for (let i = 1; i <= input.maxPages; i++) {
await requestQueue.addRequest(new Apify.Request({
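// A sketch (an assumption, since the original snippet is cut off here) of how each
// pagination request might be completed: an `offset` query parameter advancing in
// steps of the 20 rows requested above, with an illustrative 'page' label.
            url: `${startUrl}&offset=${20 * i}`,
            userData: { label: 'page' },
        }));
    }
}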
Apify.main(async () => {
// Read the actor input configuration containing the URLs for the screenshot.
// By convention, the input is present in the actor's default key-value store under the "INPUT" key.
const input = await Apify.getInput();
if (!input) throw new Error('Have you passed the correct INPUT?');
const { sources } = input;
const requestList = new Apify.RequestList({ sources });
await requestList.initialize();
const crawler = new Apify.PuppeteerCrawler({
requestList,
handlePageFunction: async ({ page, request }) => {
console.log(`Processing ${request.url}...`);
// This is a Puppeteer function that takes a screenshot of the page and returns its buffer.
const screenshotBuffer = await page.screenshot();
// The record key may only include the following characters: a-zA-Z0-9!-_.'()
const key = request.url.replace(/[:/]/g, '_');
// Save the screenshot. Choosing the right content type will automatically
// assign the local file the right extension, in this case .png.
// The screenshots will be stored in ./apify_storage/key_value_stores/default/
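// A sketch of how this truncated snippet likely continues: the buffer is written to
// the default key-value store with Apify.setValue() and an explicit content type,
// and the crawler is then started.
            await Apify.setValue(key, screenshotBuffer, { contentType: 'image/png' });
            console.log(`Screenshot of ${request.url} saved.`);
        },
    });
    await crawler.run();
});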
Apify.main(async () => {
// Create an instance of the RequestList class that contains a list of URLs to crawl.
// Here we download and parse the list of URLs from an external file.
const requestList = new Apify.RequestList({
sources: [{ requestsFromUrl: CSV_LINK }],
});
await requestList.initialize();
// Create an instance of the CheerioCrawler class - a crawler
// that automatically loads the URLs and parses their HTML using the cheerio library.
const crawler = new Apify.CheerioCrawler({
// Let the crawler fetch URLs from our list.
requestList,
// The crawler downloads and processes the web pages in parallel, with a concurrency
// automatically managed based on the available system memory and CPU (see AutoscaledPool class).
// Here we define some hard limits for the concurrency.
minConcurrency: 10,
maxConcurrency: 50,
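// A sketch of how the truncated CheerioCrawler options above are typically completed:
// a handlePageFunction that receives the cheerio handle `$` for the parsed HTML and
// pushes results to the default dataset, after which the crawler is run. The extracted
// fields (title, first h1) are illustrative assumptions.
        handlePageFunction: async ({ request, $ }) => {
            console.log(`Processing ${request.url}...`);
            await Apify.pushData({
                url: request.url,
                title: $('title').text(),
                h1: $('h1').first().text(),
            });
        },
    });
    await crawler.run();
});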
Apify.main(async () => {
// Create a request list.
const requestList = new Apify.RequestList({
sources: [
{ url: 'http://www.example.com' },
{ url: 'http://www.example.com/?page=2' },
{ url: 'http://www.example.com/?page=3' },
{ url: 'http://www.example.com/?page=4' },
{ url: 'http://www.example.com/?page=5' },
],
});
await requestList.initialize();
const crawler = new Apify.BasicCrawler({
requestList,
// This function is executed for each request.
// If a request fails, it is retried 3 times.
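// A sketch of the handler those comments describe, assuming an Apify SDK version that
// provides Apify.utils.requestAsBrowser(): each URL is fetched and the response body
// stored in the default dataset, and the crawler is then run.
        handleRequestFunction: async ({ request }) => {
            console.log(`Processing ${request.url}...`);
            const { body } = await Apify.utils.requestAsBrowser({ url: request.url });
            await Apify.pushData({
                url: request.url,
                html: body,
            });
        },
    });
    await crawler.run();
});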
Apify.main(async () => {
// Create a request list.
const requestList = new Apify.RequestList({
sources: [
{ url: 'http://www.example.com' },
{ url: 'http://www.some-nonexisting-domain.com' },
],
});
await requestList.initialize();
const crawler = new Apify.PuppeteerCrawler({
requestList,
disableProxy: true,
// This function is executed for each request.
// If a request fails, it is retried 3 times.
// The page parameter is Puppeteer's Page object with the loaded page.
handlePageFunction: async ({ page, request }) => {
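// A sketch of how this snippet typically continues: the handler logs the page title,
// and a handleFailedRequestFunction (a standard crawler option) records requests that
// still fail after the retries mentioned above; the nonexistent domain in the sources
// exists to exercise exactly that path.
            console.log(`Title of ${request.url}: ${await page.title()}`);
        },
        handleFailedRequestFunction: async ({ request }) => {
            console.log(`Request ${request.url} failed too many times.`);
        },
    });
    await crawler.run();
});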
Apify.main(async () => {
// Create and initialize an instance of the RequestList class that contains
// a list of URLs to crawl. Here we use just a few hard-coded URLs.
const requestList = new Apify.RequestList({
sources: [
{ url: 'http://www.google.com/' },
{ url: 'http://www.example.com/' },
{ url: 'http://www.bing.com/' },
{ url: 'http://www.wikipedia.com/' },
],
});
await requestList.initialize();
// Create a BasicCrawler - the simplest crawler that enables
// users to implement the crawling logic themselves.
const crawler = new Apify.BasicCrawler({
// Let the crawler fetch URLs from our list.
requestList,
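// A sketch of the crawling logic the comments above leave to the user, again assuming
// Apify.utils.requestAsBrowser() is available in the SDK version in use: fetch each
// URL from the list, record the response size in the default dataset, and run the crawler.
        handleRequestFunction: async ({ request }) => {
            console.log(`Fetching ${request.url}...`);
            const { body } = await Apify.utils.requestAsBrowser({ url: request.url });
            await Apify.pushData({ url: request.url, htmlLength: body.length });
        },
    });
    await crawler.run();
    console.log('Crawl finished.');
});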
Apify.main(async () => {
const requestList = new Apify.RequestList({
sources: [{ requestsFromUrl: 'https://edition.cnn.com/sitemaps/cnn/news.xml' }],
});
await requestList.initialize();
const crawler = new Apify.PuppeteerCrawler({
requestList,
handlePageFunction: async ({ page, request }) => {
console.log(`Processing ${request.url}...`);
await Apify.pushData({
url: request.url,
title: await page.title(),
html: await page.content(),
});
},
});
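// A short sketch of the closing lines this snippet omits: start the crawler and let
// Apify.main() resolve once all the sitemap URLs have been processed.
    await crawler.run();
    console.log('Crawler finished.');
});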