/**
 * This example demonstrates how to load pages in headless Chrome / Puppeteer
 * over <a href="https://apify.com/docs/proxy">Apify Proxy</a>.
 * To make it work, you'll need an Apify Account
 * that has access to the proxy.
 * The proxy password is available on the <a href="https://my.apify.com/proxy">Proxy</a> page in the app.
 * Just set it to the `APIFY_PROXY_PASSWORD` [environment variable](../guides/environmentvariables)
 * or run the script using the CLI.
 *
 * To run this example on the Apify Platform, select the `Node.js 10 + Chrome on Debian (apify/actor-node-chrome)` base image
 * on the source tab of your actor configuration.
 */
const Apify = require('apify');
Apify.main(async () => {
    // Apify.launchPuppeteer() is similar to Puppeteer's launch() function.
    // It accepts the same parameters and returns a preconfigured Puppeteer.Browser instance.
    // Moreover, it accepts several additional options, such as useApifyProxy.
    const options = {
        useApifyProxy: true,
    };
    const browser = await Apify.launchPuppeteer(options);
    console.log('Running Puppeteer script...');
    // Proceed with a plain Puppeteer script.
    const page = await browser.newPage();
    const url = 'https://en.wikipedia.org/wiki/Main_Page';
    await page.goto(url);
    const title = await page.title();
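    // A minimal way to finish the example: report the page title and close the browser.
    console.log(`Page title: ${title}`);
    await browser.close();
    console.log('Done.');
});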
/**
 * This example shows how to scrape the Hacker News site (https://news.ycombinator.com) using the Apify SDK and Puppeteer.
 *
 * The example uses:
 * - Apify PuppeteerCrawler to scrape pages using Puppeteer in parallel
 * - Apify Dataset to store data
 * - Apify RequestQueue to manage a dynamic queue of pending and handled requests
 * - Puppeteer to control the headless Chrome browser
 */
const Apify = require('apify');
Apify.main(async () => {
    // Get the queue and enqueue the first URL.
    const requestQueue = await Apify.openRequestQueue();
    // Enqueue the start URL.
    await requestQueue.addRequest(new Apify.Request({ url: 'https://news.ycombinator.com/' }));
    // Create the crawler.
    const crawler = new Apify.PuppeteerCrawler({
        requestQueue,
        // This function is executed for each request.
        // If a request fails, it is retried up to 3 times.
        // The page parameter is Puppeteer's Page object with the loaded page.
        handlePageFunction: async ({ page, request }) => {
            console.log(`Processing ${request.url}...`);
// This is the main Node.js source code file of your actor.
// It is referenced from the "scripts" section of the package.json file,
// so that it can be started by running "npm start".
// Include Apify SDK. For more information, see https://sdk.apify.com/
const Apify = require('apify');
const rp = require('request-promise');
Apify.main(async () => {
    // Get the input of the actor (here only for demonstration purposes).
    // If you'd like to have your input checked and have Apify display
    // a user interface for it, add an INPUT_SCHEMA.json file to your actor.
    // For more information, see https://apify.com/docs/actor/input-schema
    const input = await Apify.getInput();
    console.log('Input:');
    console.dir(input);
    if (!input || !input.sources) throw new Error('Input must be a JSON object with the "sources" field!');
    const requestList = await Apify.openRequestList('my-request-list', input.sources);
    // Create a basic crawler that will use request-promise to download
    // web pages from the given list of URLs.
    const basicCrawler = new Apify.BasicCrawler({
        requestList,
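        // The crawler configuration continues with a page-handling function. The body
        // below is a sketch based on the comment above (request-promise download,
        // results stored to the default dataset), not the original snippet's code.
        handleRequestFunction: async ({ request }) => {
            console.log(`Fetching ${request.url}...`);
            // Download the raw HTML and store it together with the URL.
            const html = await rp(request.url);
            await Apify.pushData({ url: request.url, html });
        },
    });
    // Run the crawler and wait for it to finish.
    await basicCrawler.run();
    console.log('Crawler finished.');
});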
/**
 * The script extracts the current Bitcoin prices from <a href="https://www.kraken.com/">Kraken.com</a>
 * and sends them to your email using the <a href="https://apify.com/apify/send-mail">apify/send-mail</a> actor.
 *
 * To make the example work, you'll need an <a href="https://my.apify.com/">Apify Account</a>.
 * Go to the <a href="https://my.apify.com/account#/integrations">Account - Integrations</a> page to obtain your API token
 * and set it to the `APIFY_TOKEN` [environment variable](../guides/environmentvariables), or run the script using the CLI.
 * If you deploy this actor to the Apify Cloud, you can set up a schedule to run it early every morning.
 *
 * To run this example on the Apify Platform, select the `Node.js 10 + Chrome on Debian (apify/actor-node-chrome)` base image
 * on the source tab of your actor configuration.
 */
const Apify = require('apify');
Apify.main(async () => {
    // Launch the web browser.
    const browser = await Apify.launchPuppeteer();
    console.log('Obtaining email address...');
    const user = await Apify.client.users.getUser();
    // Load Kraken.com charts and get the last traded price of BTC.
    console.log('Extracting data from kraken.com...');
    const page = await browser.newPage();
    await page.goto('https://www.kraken.com/charts');
    const tradedPricesHtml = await page.$eval('#ticker-top ul', el => el.outerHTML);
    // Send the prices to your email. For that, you can use an actor we already
    // have available on the platform under the name apify/send-mail.
    // The second parameter to the Apify.call() invocation is the actor's
    // desired input. You can find the required input parameters by checking
    // the actor's documentation.
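    // A sketch of that call; the "to", "subject" and "html" fields follow the documented
    // input of apify/send-mail, but treat the exact values here as illustrative.
    console.log(`Sending email to ${user.email}...`);
    await Apify.call('apify/send-mail', {
        to: user.email,
        subject: 'Kraken.com BTC',
        html: `<h1>Kraken.com BTC prices</h1>${tradedPricesHtml}`,
    });
    // Close the browser and finish the actor run.
    await browser.close();
    console.log('Email sent. Actor finished.');
});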
/**
 * This example shows how to scrape the Hacker News site (https://news.ycombinator.com) using the Apify SDK
 * with the Cheerio and Request NPM packages.
 *
 * The example uses:
 * - Apify BasicCrawler to scrape pages in parallel
 * - Apify Dataset to store data
 * - Apify RequestQueue to manage a dynamic queue of pending and handled requests
 * - Request NPM package to fetch the HTML content of a website
 * - Cheerio NPM package to parse the HTML and extract data
 */
const Apify = require('apify');
const rp = require('request-promise');
const cheerio = require('cheerio');
Apify.main(async () => {
    // Get the queue and enqueue the first URL.
    const requestQueue = await Apify.openRequestQueue();
    // Enqueue the start URL.
    await requestQueue.addRequest(new Apify.Request({ url: 'https://news.ycombinator.com/' }));
    // Create the crawler.
    const crawler = new Apify.BasicCrawler({
        requestQueue,
        // This function is executed for each request.
        // If a request fails, it is retried up to 3 times.
        handleRequestFunction: async ({ request }) => {
            console.log(`Processing ${request.url}...`);
            // Request the HTML of the page.
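            // What follows is a sketch of the remaining steps, not the original snippet:
            // fetch the HTML with request-promise, parse it with cheerio and store the
            // extracted posts. The selectors mirror the Puppeteer example above.
            const html = await rp(request.url);
            const $ = cheerio.load(html);
            const data = [];
            $('.athing').each((index, el) => {
                data.push({
                    title: $(el).find('.title a').text(),
                    rank: $(el).find('.rank').text(),
                    href: $(el).find('.title a').attr('href'),
                });
            });
            await Apify.pushData(data);
        },
    });
    // Run the crawler and wait for it to finish.
    await crawler.run();
    console.log('Crawler finished.');
});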
const Apify = require('apify');
Apify.main(async () => {
    // Launch the web browser.
    const browser = await Apify.launchPuppeteer();
    // Load http://goldengatebridge75.org/news/webcam.html and get an IFRAME with the webcam stream.
    console.log('Opening web page...');
    const page = await browser.newPage();
    await page.goto('http://goldengatebridge75.org/news/webcam.html');
    const iframe = (await page.frames()).pop();
    // Get the webcam image element handle.
    const imageElementHandle = await iframe.$('.VideoColm img');
    // Give the webcam image some time to load.
    console.log('Waiting for page to load...');
    await Apify.utils.sleep(3000);
    // Get a screenshot of that image.
    const imageBuffer = await imageElementHandle.screenshot();
    console.log('Screenshot captured.');
    // Save the screenshot as the actor's output. By convention, similarly to "INPUT",
    // the actor's output is stored in the default key-value store under the "OUTPUT" key.
    await Apify.setValue('OUTPUT', imageBuffer, { contentType: 'image/jpeg' });
    console.log('Actor finished.');
});
const Apify = require('apify');
Apify.main(async () => {
    // Get the input of the actor. If you'd like to have your input checked and have Apify
    // display a user interface for it, add an INPUT_SCHEMA.json file to your actor.
    // For more information, see https://apify.com/docs/actor/input-schema
    const input = await Apify.getInput();
    console.log('Input:');
    console.dir(input);
    // Do something useful here...
    // Save the output.
    const output = {
        receivedInput: input,
        message: 'Hello sir!',
    };
    console.log('Output:');
    console.dir(output);
    await Apify.setValue('OUTPUT', output);
});
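// For reference, a minimal INPUT_SCHEMA.json could look roughly like the following
// (the "sources" property mirrors the request-list example above; everything except
// the schema keywords is illustrative):
//
// {
//     "title": "Actor input",
//     "type": "object",
//     "schemaVersion": 1,
//     "properties": {
//         "sources": {
//             "title": "Start URLs",
//             "type": "array",
//             "description": "URLs to crawl",
//             "editor": "requestListSources"
//         }
//     },
//     "required": ["sources"]
// }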
// If a minimum/maximum price filter is set (and no property-type filter), apply it.
if (settingMinMaxPrice && !settingPropertyType) {
    await setMinMaxPrice(page, input, requestQueue);
}
// If filtering is enabled, enqueue the necessary pages.
if (input.useFilters && !filtered) {
    console.log('enqueuing filtered pages...');
    await enqueueLinks(page, requestQueue, '.filterelement', null, 'page', fixUrl('&', input), async (link) => {
        const lText = await getAttribute(link, 'textContent');
        return lText + '_' + 0;
    });
}
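// The helpers used here (setMinMaxPrice, enqueueLinks, fixUrl, getAttribute,
// addUrlParameters, listPageFunction) are defined elsewhere in this scraper and are
// not part of this snippet. As a rough illustration only, getAttribute might read a
// property from a Puppeteer element handle like this:
const getAttribute = async (element, attribute) => {
    try {
        // Read a property such as textContent or href from the element handle.
        const property = await element.getProperty(attribute);
        return (await property.jsonValue()).trim();
    } catch (e) {
        return null;
    }
};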
if (enqueuingReady && input.simple) { // If simple output is enough, extract the data.
    console.log('extracting data...');
    await Apify.setValue('page.html', await page.content(), { contentType: 'text/html' });
    await Apify.utils.puppeteer.injectJQuery(page);
    const result = await page.evaluate(listPageFunction, input);
    console.log('Found ' + result.length + ' results');
    if (result.length > 0) {
        const toBeAdded = [];
        // Skip items that were already crawled, using the persisted state object.
        for (const item of result) {
            item.url = addUrlParameters(item.url, input);
            if (!state.crawled[item.name]) {
                toBeAdded.push(item);
                state.crawled[item.name] = true;
            }
        }
        if (migrating) { await Apify.setValue('STATE', state); }
        if (toBeAdded.length > 0) { await Apify.pushData(toBeAdded); }
    }
} else if (enqueuingReady) { // If not, enqueue the detail pages to be extracted.
// A function to be evaluated by Puppeteer within the browser context.
const pageFunction = ($posts) => {
    const data = [];
    // We're getting the title, rank and URL of each post on Hacker News.
    $posts.forEach(($post) => {
        data.push({
            title: $post.querySelector('.title a').innerText,
            rank: $post.querySelector('.rank').innerText,
            href: $post.querySelector('.title a').href,
        });
    });
    return data;
};
const data = await page.$$eval('.athing', pageFunction);
// Store the results to the default dataset.
await Apify.pushData(data);
// Find a link to the next page and enqueue it if it exists.
const infos = await Apify.utils.enqueueLinks({
    page,
    requestQueue,
    selector: '.morelink',
});
if (infos.length === 0) console.log(`${request.url} is the last page!`);
},
    console.log('enqueuing detail pages...');
    // await enqueueLinks(page, requestQueue, '.hotel_name_link', null, 'detail',
    //     fixUrl('&', input), (link) => getAttribute(link, 'textContent'));
    const urlMod = fixUrl('&', input);
    const keyMod = (link) => getAttribute(link, 'textContent');
    const prItem = await page.$('.bui-pagination__info');
    const pageRange = (await getAttribute(prItem, 'textContent')).match(/\d+/g);
    const firstItem = parseInt(pageRange[0], 10);
    // Enqueue a detail-page request for every hotel link found on the page.
    const links = await page.$$('.hotel_name_link');
    for (let iLink = 0; iLink < links.length; iLink++) {
        const link = links[iLink];
        const href = await getAttribute(link, 'href');
        if (href) {
            await requestQueue.addRequest(new Apify.Request({