Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
// Fill in defaults for every crawl option the caller left unset.
opts.userAgent = opts.userAgent || "node-webcheck";
// Use a null check instead of `||` for the numeric options so that an explicit
// 0 supplied by the caller is kept rather than silently replaced by the default.
if (opts.retries == null) opts.retries = 3;
if (opts.retryTimeout == null) opts.retryTimeout = 10000;
opts.skipForeigners = opts.skipForeigners || false; // completely skip URLs on domains other than the baseURL
opts.followForeigners = opts.followForeigners || false; // only check links to foreign domains; do not check the links found on those pages
var emit = opts.eventEmitter = opts.eventEmitter || function(){}; // falls back to a no-op so emit() can always be called
if (opts.forceUTF8 === undefined) opts.forceUTF8 = true;
// at this moment duplicates are all skipped... // if (opts.skipDuplicates === undefined) opts.skipDuplicates = true;
var res = {};
// Announce the starting URL to any listener.
emit("base", url);
var c = new Crawler({
"maxConnections": opts.maxConnections,
"timeout": opts.timeout,
"userAgent": opts.userAgent,
"retries": opts.retries,
"retryTimeout": opts.retryTimeout,
"forceUTF8": opts.forceUTF8,
"callback": function(error,result,$) {
if (error) {
return emit("crawlingError", error); //res[result.window.location.href] = {};
}
try {
var po = res[result.request.href] = {};
po.url = result.request.href;
po.status = result.statusCode;
po.headers = result.headers;
var Crawler = require("crawler").Crawler;
var Firebase = require('firebase');
// Firebase reference rooted at the "users" location.
var firebaseDB = new Firebase('https://berkeleydir.firebaseio.com/users');
// Prefix of the directory detail page; a numeric uid is appended to form each target URL.
var url = 'https://calnet.berkeley.edu/directory/details.pl?uid=';
var urls = [];
var numbers = []; // not referenced in the visible part of the script
// Build one URL per uid in the half-open range [514101, 1200000).
var i = 514101;
while (i < 1200000) {
    urls.push(url + i);
    i++;
}
var c = new Crawler({
"maxConnections": 10,
// This will be called for each crawled page
"callback": function(error, result ,$) {
// $ is a jQuery instance scoped to the server-side DOM of the page
var name = $('#content > p span:nth-child(2)').html();
if (name) {
var id = result.window.location._url.query.replace('uid=', '');
var email = $('#content span:contains("Email:")').next().text();
var person = {
id: parseInt(id, 10),
name: name,
email: email
};
firebaseDB.child(id).set(person);
console.log(person);
/**
 * Constructor that stores the given settings on the instance and creates a
 * Crawler limited to a single connection with a 20-second timeout.
 *
 * @param {*} settings - Stored unchanged as `this.settings`.
 */
var proxyCrawler = function (settings) {
    var timeoutMs = 20 * 1000;
    var crawlerOptions = {
        maxConnections: 1,
        timeout: timeoutMs
    };

    this.settings = settings;
    // NOTE(review): `crawler` is not declared inside this function; presumably
    // it is a binding declared in an enclosing scope — confirm, otherwise this
    // assignment creates an implicit global (or throws in strict mode).
    crawler = new Crawler(crawlerOptions);
    console.log("start proxy crawler.");
};
applicationReady.then(function()
{
let window = Services.wm.getMostRecentWindow("navigator:browser");
run(window, urls, timeout, maxtabs, baseURL + "save", function()
{
Services.startup.quit(Services.startup.eAttemptQuit);
});
}, function(exception)
{
Application_Session.prototype.run = function( finisher, catcher )
{
this.finisher = finisher;
this.catcher = catcher;
if ( !this.runnable )
{
this._run_catch( new Error( "Application_Session is not runnable" ) );
return;
}
this.runnable = false;
this.current_crawler = new Crawler(
this.instructions, this.outputs, this.window,
this.time_limit, this.leave_open, this.n_tabs
);
if ( this.progress )
{
/*
* Add an instance-specific notice member to the crawler's progress instance. This is cleaner than
* bothering with a subclass of the progress-notification class.
*/
this.current_crawler.progress.notice = function( notice )
{
notice( this );
}.bind( this.current_crawler.progress, this.progress );
}
function PageProcessor(name,config,performSegue){
this.name=name;
this.type=config.type;
this.performSegue=performSegue;
//init seques
this.segues={};
for(var i=0;i
// Fetch /stats, refresh the checker and crawler task labels, write the two
// completion percentages into the chart option and re-apply it to the chart.
$.get('/stats').done(function (data) {
    // Converts a done/total pair into a percentage rounded to two decimals.
    // A total of 0 (or an unparsable count) makes the division NaN or Infinity;
    // report 0 in that case instead of writing a non-finite value into the chart.
    var toPercent = function (done, total) {
        var percent = (done / total) * 100;
        return isFinite(percent) ? Number(percent.toFixed(2)) : 0;
    };

    var checkQueue = data["queue.check"];
    if (checkQueue) {
        // NOTE(review): data["checker.url"] is read without a presence check — confirm it is always sent together with "queue.check".
        $("#checker_task_num").text(checkQueue.pop + " / " + checkQueue.push + ", valid: " + data["checker.url"].valid_seed);
        option.series[0].data[0].value = toPercent(checkQueue.pop, checkQueue.push);
    }

    var pipeline = data["crawler.pipeline"];
    if (pipeline) {
        // NOTE(review): data["queue.fetch"] is read without a presence check — confirm it is always sent together with "crawler.pipeline".
        var fetchQueue = data["queue.fetch"];
        $("#crawler_task_num").text(safeGetValue(pipeline.finished) + " / " + pipeline.total + ", error: " + safeGetValue(pipeline.error) + ", break: " + safeGetValue(pipeline.break) + ", queue: " + safeGetValue(fetchQueue.pop) + " / " + fetchQueue.push);
        // Explicit radix so the counters are always parsed as decimal.
        option.series[1].data[0].value = toPercent(
            parseInt(safeGetValue(fetchQueue.pop), 10),
            parseInt(safeGetValue(fetchQueue.push), 10)
        );
    }

    myChart.setOption(option, true);
});
// Fetch /stats, refresh the checker and crawler task labels, write the two
// completion percentages into the chart option and re-apply it to the chart.
$.get('/stats').done(function (data) {
    // Converts a done/total pair into a percentage rounded to two decimals.
    // A total of 0 (or an unparsable count) makes the division NaN or Infinity;
    // report 0 in that case instead of writing a non-finite value into the chart.
    var toPercent = function (done, total) {
        var percent = (done / total) * 100;
        return isFinite(percent) ? Number(percent.toFixed(2)) : 0;
    };

    var checkQueue = data["queue.check"];
    if (checkQueue) {
        // NOTE(review): data["checker.url"] is read without a presence check — confirm it is always sent together with "queue.check".
        $("#checker_task_num").text(checkQueue.pop + " / " + checkQueue.push + ", valid: " + data["checker.url"].valid_seed);
        option.series[0].data[0].value = toPercent(checkQueue.pop, checkQueue.push);
    }

    var pipeline = data["crawler.pipeline"];
    if (pipeline) {
        // NOTE(review): data["queue.fetch"] is read without a presence check — confirm it is always sent together with "crawler.pipeline".
        var fetchQueue = data["queue.fetch"];
        $("#crawler_task_num").text(safeGetValue(pipeline.finished) + " / " + pipeline.total + ", error: " + safeGetValue(pipeline.error) + ", break: " + safeGetValue(pipeline.break) + ", queue: " + safeGetValue(fetchQueue.pop) + " / " + fetchQueue.push);
        // Explicit radix so the counters are always parsed as decimal.
        option.series[1].data[0].value = toPercent(
            parseInt(safeGetValue(fetchQueue.pop), 10),
            parseInt(safeGetValue(fetchQueue.push), 10)
        );
    }

    myChart.setOption(option, true);
});
// Fetch /stats and refresh the checker and crawler task counter labels.
$.get('/stats').done(function (stats) {
    var checkQueue = stats["queue.check"];
    if (checkQueue) {
        var checkerLabel = $("#checker_task_num");
        var checkerText = checkQueue.pop + " / " + checkQueue.push;
        checkerText += ", valid: " + stats["checker.url"].valid_seed;
        checkerLabel.text(checkerText);
    }

    var pipeline = stats["crawler.pipeline"];
    if (pipeline) {
        var crawlerLabel = $("#crawler_task_num");
        // Pieces are appended left to right so the helper calls and property
        // reads happen in the same order as a single concatenation would.
        var crawlerText = safeGetValue(pipeline.finished) + " / " + pipeline.total;
        crawlerText += ", error: " + safeGetValue(pipeline.error);
        crawlerText += ", break: " + safeGetValue(pipeline.break);
        crawlerText += ", queue: " + safeGetValue(stats["queue.fetch"].pop);
        crawlerText += " / " + stats["queue.fetch"].push;
        crawlerLabel.text(crawlerText);
    }
});
// Fetch /stats and refresh the checker and crawler task counter labels.
$.get('/stats').done(function (stats) {
    var checkQueue = stats["queue.check"];
    if (checkQueue) {
        var checkerLabel = $("#checker_task_num");
        var checkerText = checkQueue.pop + " / " + checkQueue.push;
        checkerText += ", valid: " + stats["checker.url"].valid_seed;
        checkerLabel.text(checkerText);
    }

    var pipeline = stats["crawler.pipeline"];
    if (pipeline) {
        var crawlerLabel = $("#crawler_task_num");
        // Pieces are appended left to right so the helper calls and property
        // reads happen in the same order as a single concatenation would.
        var crawlerText = safeGetValue(pipeline.finished) + " / " + pipeline.total;
        crawlerText += ", error: " + safeGetValue(pipeline.error);
        crawlerText += ", break: " + safeGetValue(pipeline.break);
        crawlerText += ", queue: " + safeGetValue(stats["queue.fetch"].pop);
        crawlerText += " / " + stats["queue.fetch"].push;
        crawlerLabel.text(crawlerText);
    }
});