How to use the Apify.utils object in apify

To help you get started, we’ve selected a few apify examples, based on popular ways it is used in public projects.
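
Apify.utils is not a single function but an object that bundles the SDK's helper utilities; the ones used in the examples below are sleep, log, enqueueLinks and createRequestDebugInfo. A minimal sketch of importing and calling two of them (the log level and the sleep duration are just placeholders):

const Apify = require('apify');
const { log, sleep } = Apify.utils;

Apify.main(async () => {
    // Only print warnings and errors.
    log.setLevel(log.LEVELS.WARNING);

    // Pause the actor for one second.
    await sleep(1000);
    log.warning('Done sleeping.');
});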


github apifytech / apify-js / examples / synchronous_run.js
Apify.main(async () => {
    // Launch web browser.
    const browser = await Apify.launchPuppeteer();

    // Load http://goldengatebridge75.org/news/webcam.html and get an IFRAME with the webcam stream
    console.log('Opening web page...');
    const page = await browser.newPage();
    await page.goto('http://goldengatebridge75.org/news/webcam.html');
    const iframe = (await page.frames()).pop();

    // Get webcam image element handle.
    const imageElementHandle = await iframe.$('.VideoColm img');

    // Give the webcam image some time to load.
    console.log('Waiting for page to load...');
    await Apify.utils.sleep(3000);

    // Get a screenshot of that image.
    const imageBuffer = await imageElementHandle.screenshot();
    console.log('Screenshot captured.');

    // Save the screenshot as the actor's output. By convention, similarly to "INPUT",
    // the actor's output is stored in the default key-value store under the "OUTPUT" key.
    await Apify.setValue('OUTPUT', imageBuffer, { contentType: 'image/jpeg' });
    console.log('Actor finished.');
});
github apifytech / actor-scraper / scraper-tools / src / tools.js
// from being saved to dataset. It will just contain
    // the relevant metadata.
    let result = pageFunctionResult || {};

    // Validate the result.
    const type = typeof result;
    if (type !== 'object') {
        throw new Error(`Page function must return Object | Object[], but it returned ${type}.`);
    }

    // Metadata need to be appended to each item
    // to match results with dataset "lines".
    if (!Array.isArray(result)) result = [result];
    const meta = {
        '#error': isError,
        '#debug': Apify.utils.createRequestDebugInfo(request, response),
    };

    return result.map(item => Object.assign({}, item, meta));
};
github apifytech / apify-js / examples / cheerio_crawler.js
/**
 * This example demonstrates how to use [`CheerioCrawler`](../api/cheeriocrawler)
 * to crawl a list of URLs from an external file,
 * load each URL using a plain HTTP request, parse the HTML using <a href="https://www.npmjs.com/package/cheerio">cheerio</a>
 * and extract some data from it: the page title and all H1 tags.
 *
 * To run this example on the Apify Platform, select the `Node.js 10 on Alpine Linux (apify/actor-node-basic)` base image
 * on the source tab of your actor configuration.
 */

const Apify = require('apify');

// Apify.utils contains various utilities, e.g. for logging.
// Here we turn off the logging of unimportant messages.
const { log } = Apify.utils;
log.setLevel(log.LEVELS.WARNING);

// A link to a list of Fortune 500 companies' websites available on GitHub.
const CSV_LINK = 'https://gist.githubusercontent.com/hrbrmstr/ae574201af3de035c684/raw/f1000.csv';

// Apify.main() function wraps the crawler logic (it is optional).
Apify.main(async () => {
    // Create an instance of the RequestList class that contains a list of URLs to crawl.
    // Here we download and parse the list of URLs from an external file.
    const requestList = new Apify.RequestList({
        sources: [{ requestsFromUrl: CSV_LINK }],
    });
    await requestList.initialize();

    // Create an instance of the CheerioCrawler class - a crawler
    // that automatically loads the URLs and parses their HTML using the cheerio library.
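
The snippet above ends before the crawler is actually constructed. A minimal sketch of how the instantiation might continue, based on the comments and the JSDoc above (the handler body and the h1texts field name are assumptions, not the original file's code):

    const crawler = new Apify.CheerioCrawler({
        requestList,
        // $ is the cheerio object containing the parsed HTML of the loaded page.
        handlePageFunction: async ({ request, $ }) => {
            // Extract the page title and the text of all H1 tags.
            const title = $('title').text();
            const h1texts = [];
            $('h1').each((index, el) => {
                h1texts.push($(el).text());
            });
            // Store the results in the default dataset.
            await Apify.pushData({ url: request.url, title, h1texts });
        },
    });

    await crawler.run();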
github apifytech / apify-cli / src / templates / cheerio_crawler / main.js
handlePageFunction: async ({ request, page }) => {
            const title = await page.title();
            console.log(`Title of ${request.url}: ${title}`);
            await Apify.pushData({
                title,
                '#debug': Apify.utils.createRequestDebugInfo(request),
            });
            await Apify.utils.enqueueLinks({ page, selector: 'a', pseudoUrls, requestQueue });
        },
github apifytech / apify-js / examples / puppeteer_crawler.js
handleFailedRequestFunction: async ({ request }) => {
            console.log(`Request ${request.url} failed too many times`);
            await Apify.pushData({
                '#debug': Apify.utils.createRequestDebugInfo(request),
            });
        },
    });
github apifytech / apify-cli / src / templates / cheerio_crawler / main.js
handleFailedRequestFunction: async ({ request }) => {
            console.log(`Request ${request.url} failed too many times`);
            await Apify.pushData({
                '#debug': Apify.utils.createRequestDebugInfo(request),
            });
        },
github apifytech / actor-scraper / cheerio-scraper / src / crawler_setup.js
async _handleLinks($, request) {
        if (!(this.input.linkSelector && this.requestQueue)) return;
        const currentDepth = request.userData[META_KEY].depth;
        const hasReachedMaxDepth = this.input.maxCrawlingDepth && currentDepth >= this.input.maxCrawlingDepth;
        if (hasReachedMaxDepth) {
            log.debug(`Request ${request.url} reached the maximum crawling depth of ${currentDepth}.`);
            return;
        }

        await Apify.utils.enqueueLinks({
            $,
            selector: this.input.linkSelector,
            pseudoUrls: this.input.pseudoUrls,
            requestQueue: this.requestQueue,
            baseUrl: request.loadedUrl,
            transformRequestFunction: (requestOptions) => {
                requestOptions.userData = {
                    [META_KEY]: {
                        parentRequestId: request.id || request.uniqueKey,
                        depth: currentDepth + 1,
                    },
                };
                requestOptions.useExtendedUniqueKey = true;
                requestOptions.keepUrlFragment = this.input.keepUrlFragments;
                return requestOptions;
            },
github apifytech / apify-cli / src / templates / basic_crawler / main.js
handleRequestFunction: async ({ request }) => {
            await Apify.pushData({
                request,
                finishedAt: new Date(),
                html: await rp(request.url),
                '#debug': Apify.utils.createRequestDebugInfo(request),
            });
        },
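
The basic_crawler snippet above calls an rp() helper that is required earlier in the template (a request-promise style HTTP client). A minimal sketch of the surrounding setup, with the start URL as a placeholder and the require names assumed:

const Apify = require('apify');
const rp = require('request-promise'); // assumed: the HTTP client behind rp() above

Apify.main(async () => {
    // A request list with a single start URL; replace with your own sources.
    const requestList = new Apify.RequestList({
        sources: [{ url: 'https://www.example.com' }],
    });
    await requestList.initialize();

    const crawler = new Apify.BasicCrawler({
        requestList,
        handleRequestFunction: async ({ request }) => {
            await Apify.pushData({
                request,
                finishedAt: new Date(),
                html: await rp(request.url),
                '#debug': Apify.utils.createRequestDebugInfo(request),
            });
        },
    });

    await crawler.run();
});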

apify

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.

License: Apache-2.0