/**
* This example demonstrates how to load pages in headless Chrome / Puppeteer
* over <a href="https://apify.com/docs/proxy">Apify Proxy</a>.
* To make it work, you'll need an Apify Account
* that has access to the proxy.
* The proxy password is available on the <a href="https://my.apify.com/proxy">Proxy</a> page in the app.
* Just set it to the `APIFY_PROXY_PASSWORD` [environment variable](../guides/environmentvariables)
* or run the script using the CLI.
*
* To run this example on the Apify Platform, select the `Node.js 10 + Chrome on Debian (apify/actor-node-chrome)` base image
* on the source tab of your actor configuration.
*/
const Apify = require('apify');

Apify.main(async () => {
    // Apify.launchPuppeteer() is similar to Puppeteer's launch() function.
    // It accepts the same parameters and returns a preconfigured Puppeteer.Browser instance.
    // Moreover, it accepts several additional options, such as useApifyProxy.
    const options = {
        useApifyProxy: true,
    };
    const browser = await Apify.launchPuppeteer(options);

    console.log('Running Puppeteer script...');

    // Proceed with a plain Puppeteer script.
    const page = await browser.newPage();
    const url = 'https://en.wikipedia.org/wiki/Main_Page';
    await page.goto(url);
    const title = await page.title();
    console.log(`Page title: ${title}`);

    // Close the browser when done.
    await browser.close();
    console.log('Done.');
});
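/**
 * If your account has access to specific proxy groups, this SDK version also
 * lets you request them when launching the browser. A minimal sketch, assuming
 * the apifyProxyGroups option is available in your SDK version - the group
 * name below is only an illustrative placeholder:
 *
 * const browser = await Apify.launchPuppeteer({
 *     useApifyProxy: true,
 *     apifyProxyGroups: ['SHADER'],
 * });
 */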
/**
 * This example shows how to scrape the Hacker News site (https://news.ycombinator.com) using Apify SDK and Puppeteer.
 *
 * Example uses:
 * - Apify PuppeteerCrawler to scrape pages using Puppeteer in parallel
 * - Apify Dataset to store data
 * - Apify RequestQueue to manage a dynamic queue of pending and handled requests
 * - Puppeteer to control a headless Chrome browser
 */
const Apify = require('apify');

Apify.main(async () => {
    // Open the queue and enqueue the start URL.
    const requestQueue = await Apify.openRequestQueue();
    await requestQueue.addRequest(new Apify.Request({ url: 'https://news.ycombinator.com/' }));

    // Create the crawler.
    const crawler = new Apify.PuppeteerCrawler({
        requestQueue,

        // This function is executed for each request.
        // If the request fails, it is retried up to 3 times.
        // The page parameter is Puppeteer's Page object with the page loaded.
        handlePageFunction: async ({ page, request }) => {
            console.log(`Processing ${request.url}...`);
            // Extract the post titles; the selector is a minimal, illustrative sketch.
            const data = await page.$$eval('.athing .title a', links =>
                links.map(link => ({ title: link.innerText })));
            // Save the results to the default dataset.
            await Apify.pushData(data);
        },
    });

    await crawler.run();
    console.log('Crawler finished.');
});
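/**
 * Apify.pushData() above writes to the default dataset. If you prefer a named
 * dataset, a minimal sketch (the name "hacker-news" is only an illustration):
 *
 * const dataset = await Apify.openDataset('hacker-news');
 * await dataset.pushData(data);
 */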
// This is the main Node.js source code file of your actor.
// It is referenced from the "scripts" section of the package.json file,
// so that it can be started by running "npm start".

// Include the Apify SDK. For more information, see https://sdk.apify.com/
const Apify = require('apify');
const rp = require('request-promise');

Apify.main(async () => {
    // Get the input of the actor (here only for demonstration purposes).
    // If you'd like to have your input checked and have Apify display
    // a user interface for it, add an INPUT_SCHEMA.json file to your actor.
    // For more information, see https://apify.com/docs/actor/input-schema
    const input = await Apify.getInput();
    console.log('Input:');
    console.dir(input);

    if (!input || !input.sources) throw new Error('Input must be a JSON object with the "sources" field!');

    const requestList = await Apify.openRequestList('my-request-list', input.sources);

    // Create a basic crawler that will use request-promise to download
    // web pages from the given list of URLs.
    const basicCrawler = new Apify.BasicCrawler({
        requestList,
        // A minimal handler completing the truncated template: download each page
        // and store the HTML together with its URL.
        handleRequestFunction: async ({ request }) => {
            await Apify.pushData({
                url: request.url,
                html: await rp(request.url),
            });
        },
    });

    await basicCrawler.run();
    console.log('Crawler finished.');
});
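/**
 * A minimal INPUT_SCHEMA.json for the actor above might look like the following.
 * This is an illustrative sketch only - see the input-schema documentation
 * linked above for the full specification:
 *
 * {
 *     "title": "Basic crawler input",
 *     "type": "object",
 *     "schemaVersion": 1,
 *     "properties": {
 *         "sources": {
 *             "title": "Start URLs",
 *             "type": "array",
 *             "description": "URLs to download.",
 *             "editor": "requestListSources"
 *         }
 *     },
 *     "required": ["sources"]
 * }
 */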
/**
 * The script extracts the current Bitcoin prices from <a href="https://www.kraken.com/">Kraken.com</a>
 * and sends them to your email using the <a href="https://apify.com/apify/send-mail">apify/send-mail</a> actor.
 *
 * To make the example work, you'll need an <a href="https://my.apify.com/">Apify Account</a>.
 * Go to the <a href="https://my.apify.com/account#/integrations">Account - Integrations</a> page to obtain your API token
 * and set it to the `APIFY_TOKEN` [environment variable](../guides/environmentvariables), or run the script using the CLI.
 * If you deploy this actor to the Apify Cloud, you can set up a schedule to run it early every
 * morning.
 *
 * To run this example on the Apify Platform, select the `Node.js 10 + Chrome on Debian (apify/actor-node-chrome)` base image
 * on the source tab of your actor configuration.
 */
const Apify = require('apify');

Apify.main(async () => {
    // Launch the web browser.
    const browser = await Apify.launchPuppeteer();

    console.log('Obtaining email address...');
    const user = await Apify.client.users.getUser();

    // Load Kraken.com charts and get the last traded price of BTC.
    console.log('Extracting data from kraken.com...');
    const page = await browser.newPage();
    await page.goto('https://www.kraken.com/charts');
    const tradedPricesHtml = await page.$eval('#ticker-top ul', el => el.outerHTML);

    // Send the prices to your email. For that, you can use an actor we already
    // have available on the platform under the name: apify/send-mail.
    // The second parameter to the Apify.call() invocation is the actor's
    // desired input. You can find the required input parameters by checking
    // the actor's documentation page.
    console.log(`Sending email to ${user.email}...`);
    await Apify.call('apify/send-mail', {
        to: user.email,
        subject: 'Kraken.com BTC',
        html: `<h1>Kraken.com BTC</h1>${tradedPricesHtml}`,
    });

    console.log('Email sent. Good job!');
});
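/**
 * Apify.call() waits for the invoked actor run to finish and resolves to the
 * run object. A minimal sketch of checking the outcome - shown here only as
 * an illustration:
 *
 * const run = await Apify.call('apify/send-mail', input);
 * console.log(`send-mail finished with status: ${run.status}`);
 */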
/**
 * This example shows how to scrape the Hacker News site (https://news.ycombinator.com) using Apify SDK
 * with the Cheerio and Request NPM packages.
 *
 * Example uses:
 * - Apify BasicCrawler to scrape pages in parallel
 * - Apify Dataset to store data
 * - Apify RequestQueue to manage a dynamic queue of pending and handled requests
 * - Request NPM package to fetch the HTML content of a website
 * - Cheerio NPM package to parse the HTML and extract data
 */
const Apify = require('apify');
const rp = require('request-promise');
const cheerio = require('cheerio');

Apify.main(async () => {
    // Open the queue and enqueue the start URL.
    const requestQueue = await Apify.openRequestQueue();
    await requestQueue.addRequest(new Apify.Request({ url: 'https://news.ycombinator.com/' }));

    // Create the crawler.
    const crawler = new Apify.BasicCrawler({
        requestQueue,
        // This function is executed for each request.
        // If the request fails, it is retried up to 3 times.
        handleRequestFunction: async ({ request }) => {
            console.log(`Processing ${request.url}...`);
            // Request the HTML of the page and parse it with Cheerio.
            // The selector below is a minimal, illustrative sketch.
            const html = await rp(request.url);
            const $ = cheerio.load(html);
            const data = [];
            $('.athing .title a').each((index, el) => {
                data.push({ title: $(el).text() });
            });
            // Save the results to the default dataset.
            await Apify.pushData(data);
        },
    });

    await crawler.run();
    console.log('Crawler finished.');
});
const Apify = require('apify');
const CrawlerSetup = require('./crawler_setup');

const { utils: { log } } = Apify;

Apify.main(async () => {
    log.debug('Reading INPUT.');
    const input = await Apify.getInput();
    if (!input) throw new Error('INPUT cannot be empty!');

    // Get crawler setup and startup options.
    log.info('Configuring Cheerio Scraper.');
    const setup = new CrawlerSetup(input);
    const crawler = await setup.createCrawler();

    log.info('Configuration completed. Starting the scrape.');
    await crawler.run();
    log.info('Cheerio Scraper finished.');
});
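/**
 * Note that log.debug() output is hidden at the default log level. A minimal
 * sketch of enabling it, using the SDK's logging utility:
 *
 * log.setLevel(log.LEVELS.DEBUG);
 */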
const Apify = require('apify');
const CrawlerSetup = require('./crawler_setup');

const { utils: { log } } = Apify;

Apify.main(async () => {
    log.debug('Reading INPUT.');
    const input = await Apify.getInput();
    if (!input) throw new Error('INPUT cannot be empty!');

    // Get crawler setup and startup options.
    log.info('Configuring Puppeteer Scraper.');
    const setup = new CrawlerSetup(input);
    const crawler = await setup.createCrawler();

    log.info('Configuration completed. Starting the scrape.');
    await crawler.run();
    log.info('Puppeteer Scraper finished.');
});
const Apify = require('apify');
const CrawlerSetup = require('./crawler_setup');

const { utils: { log } } = Apify;

log.logJson = false;

Apify.main(async () => {
    log.debug('Reading INPUT.');
    const input = await Apify.getInput();
    if (!input) throw new Error('INPUT cannot be empty!');

    // Get crawler setup and startup options.
    log.info('Configuring Web Scraper.');
    const setup = new CrawlerSetup(input);
    const crawler = await setup.createCrawler();

    log.info('Configuration completed. Starting the scrape.');
    await crawler.run();
    log.info('Web Scraper finished.');
});
/**
 * This example demonstrates how to use PuppeteerCrawler in combination with RequestQueue
 * to recursively scrape the Hacker News website (https://news.ycombinator.com)
 * using headless Chrome / Puppeteer.
 * The crawler starts with a single URL, finds links to the next pages,
 * enqueues them and continues until no more desired links are available.
 * The results are stored in the default dataset. In local configuration,
 * the results are stored as JSON files in `./apify_storage/datasets/default`.
 */
const Apify = require('apify');

Apify.main(async () => {
    // Apify.openRequestQueue() is a factory to get a preconfigured RequestQueue instance.
    // We add our first request to it - the initial page the crawler will visit.
    const requestQueue = await Apify.openRequestQueue();
    await requestQueue.addRequest({ url: 'https://news.ycombinator.com/' });

    // Create an instance of the PuppeteerCrawler class - a crawler
    // that automatically loads the URLs in headless Chrome / Puppeteer.
    const crawler = new Apify.PuppeteerCrawler({
        requestQueue,
        // Here you can set options that are passed to the Apify.launchPuppeteer() function.
        // For example, you can set "slowMo" to slow down Puppeteer operations to simplify debugging.
        launchPuppeteerOptions: { slowMo: 500 },
        // Stop crawling after several pages.
        maxRequestsPerCrawl: 10,
        // This function is called for every page the crawler visits. The selectors
        // below are a minimal, illustrative sketch of the extraction logic.
        handlePageFunction: async ({ request, page }) => {
            console.log(`Processing ${request.url}...`);
            const data = await page.$$eval('.athing .title a', links =>
                links.map(link => ({ title: link.innerText, href: link.href })));
            await Apify.pushData(data);
            // Find a link to the next page and enqueue it if it exists.
            await Apify.utils.enqueueLinks({ page, requestQueue, selector: '.morelink' });
        },
    });

    await crawler.run();
    console.log('Crawler finished.');
});
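/**
 * When run locally, the scraped records end up as JSON files in
 * `./apify_storage/datasets/default`, as noted above. A minimal sketch of
 * reading them back through the SDK instead of from disk:
 *
 * const dataset = await Apify.openDataset();
 * const { items } = await dataset.getData();
 * console.log(`Scraped ${items.length} items.`);
 */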