How to use the crawler.Crawler function in crawler

To help you get started, we’ve selected a few crawler.Crawler examples, based on popular ways the function is used in public projects.

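If you haven’t used the package before, the examples below all follow the same basic pattern. Here is a minimal sketch based on the package’s documented API (the URL is a placeholder):

var Crawler = require("crawler").Crawler;

var c = new Crawler({
    maxConnections: 10,
    callback: function(error, result, $) {
        if (error) return console.error(error);
        // $ is a server-side jQuery instance for the fetched page
        console.log($("title").text());
    }
});

c.queue("http://www.example.com/");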

github atd-schubert / node-webcheck / lib / crawler.js
  opts.userAgent = opts.userAgent || "node-webcheck";
  opts.retries = opts.retries || 3;
  opts.retryTimeout = opts.retryTimeout || 10000;
  opts.skipForeigners = opts.skipForeigners || false; // completely skip URLs of domains other than the baseURL
  opts.followForeigners = opts.followForeigners || false; // check links to foreign domains, but don't follow links on those pages

  var emit = opts.eventEmitter = opts.eventEmitter || function(){}; // optional event emitter hook

  if (opts.forceUTF8 === undefined) opts.forceUTF8 = true;
  // at the moment all duplicates are skipped... // if (opts.skipDuplicates === undefined) opts.skipDuplicates = true;
  
  var res = {};
  
  emit("base", url);
  
  var c = new Crawler({
    "maxConnections": opts.maxConnections,
    "timeout": opts.timeout,
    "userAgent": opts.userAgent,
    "retries": opts.retries,
    "retryTimeout": opts.retryTimeout,
    "forceUTF8": opts.forceUTF8,
    "callback": function(error,result,$) {
      if (error) {
        return emit("crawlingError", error); //res[result.window.location.href] = {};
      }
      try {
        var po = res[result.request.href] = {};

        po.url = result.request.href;
        po.status = result.statusCode;
        po.headers = result.headers;
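The excerpt ends mid-callback; node-webcheck goes on to record further response data into po. Note that a crawler configured this way does nothing until URLs are queued, roughly like this (a sketch, not the project's exact code):

c.queue(url); // start crawling from the base URL; queue() also accepts an array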
github christianvuerings / berkeleydir / legacy / server-crawler.js
var Crawler = require("crawler").Crawler;
var Firebase = require('firebase');
var firebaseDB = new Firebase('https://berkeleydir.firebaseio.com/users');

var url = 'https://calnet.berkeley.edu/directory/details.pl?uid=';
var urls = [];
var numbers = [];
for(var i = 514101; i < 1200000; i++) {
    urls.push(url + i);
}

var c = new Crawler({
    "maxConnections": 10,

    // This will be called for each crawled page
    "callback": function(error, result ,$) {
        // $ is a jQuery instance scoped to the server-side DOM of the page
        var name = $('#content > p span:nth-child(2)').html();
        if (name) {
            var id = result.window.location._url.query.replace('uid=', '');
            var email = $('#content span:contains("Email:")').next().text();
            var person = {
                id: parseInt(id, 10),
                name: name,
                email: email
            };
            firebaseDB.child(id).set(person);
            console.log(person);
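The excerpt stops before the crawl is actually started; with the URL list built above, the usual final step is to hand it to the crawler (a sketch of the standard pattern, not necessarily the project's exact line):

c.queue(urls);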
github ahkimkoo / neocrawler / proxyCollector / proxyCrawler.js
var proxyCrawler = function(settings){
	this.settings = settings;

	// note: `crawler` is assigned without var here, so it becomes a global in the original source
	crawler = new Crawler({
		maxConnections: 1,
		timeout: (1000 * 20)
	});

	console.log("start proxy crawler.");
}
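Only construction is shown here; in use, the shared crawler instance is fed URLs the same way as in the other examples (a sketch; the URL is a hypothetical placeholder):

crawler.queue("http://www.example.com/proxy-list");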
github adblockplus / abpcrawler / lib / application.js
Application_Session.prototype.run = function( finisher, catcher )
{
  this.finisher = finisher;
  this.catcher = catcher;
  if ( !this.runnable )
  {
    this._run_catch( new Error( "Application_Session is not runnable" ) );
    return;
  }
  this.runnable = false;

  this.current_crawler = new Crawler(
    this.instructions, this.outputs, this.window,
    this.time_limit, this.leave_open, this.n_tabs
  );

  if ( this.progress )
  {
    /*
     * Add an instance-specific notice member to the crawler's progress instance. This is cleaner than
     * bothering with a subclass of the progress-notification class.
     */
    this.current_crawler.progress.notice = function( notice )
    {
      notice( this );
    }.bind( this.current_crawler.progress, this.progress );
  }
github ltebean / spiderman / lib / pageProcessor.js
function PageProcessor(name, config, performSegue){
	this.name = name;
	this.type = config.type;
	this.performSegue = performSegue;
	// init segues

	this.segues = {};
	for(var i=0;i
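The excerpt is cut off inside the loop that initializes the segues. A plausible completion, assuming config.segues is an array of segue definitions (the property names are hypothetical, not taken from spiderman):

	for(var i = 0; i < config.segues.length; i++){
		var segue = config.segues[i];
		this.segues[segue.name] = segue; // hypothetical: index segues by name
	}
}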

crawler

Crawler is a ready-to-use web spider with support for proxies, asynchronous requests, rate limiting, configurable request pools, server-side jQuery, and HTTP/2.
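Those features map onto constructor options. A minimal sketch of a rate-limited configuration (option names follow the crawler README; the URL is a placeholder):

var c = new Crawler({
    maxConnections: 2, // request pool size
    rateLimit: 1000    // minimum gap in ms between requests
});

c.queue("http://www.example.com/");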

License: MIT
Latest version published 5 months ago
Package Health Score: 78 / 100