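# Upload a crawled document to an Aleph instance: bail out unless an Aleph
# host and API key are configured, resolve the target collection, and skip
# content that has not changed since the last incremental run.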
def aleph_emit(context, data):
    if not settings.ALEPH_HOST:
        context.log.warning("No $MEMORIOUS_ALEPH_HOST, skipping upload...")
        return
    if not settings.ALEPH_API_KEY:
        context.log.warning("No $MEMORIOUS_ALEPH_API_KEY, skipping upload...")
        return
    session_id = 'memorious:%s' % context.crawler.name
    api = AlephAPI(settings.ALEPH_HOST, settings.ALEPH_API_KEY,
                   session_id=session_id)
    collection_id = get_collection_id(context, api)
    if collection_id is None:
        context.log.warning("Cannot get aleph collection.")
        return
    content_hash = data.get('content_hash')
    source_url = data.get('source_url', data.get('url'))
    foreign_id = data.get('foreign_id', data.get('request_id', source_url))
    if context.skip_incremental(collection_id, foreign_id, content_hash):
        context.log.info("Skip aleph upload: %s", foreign_id)
        return
    meta = {
        'crawler': context.crawler.name,
        'foreign_id': foreign_id,
    }
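    # A minimal sketch of how the upload step might continue from here; it
    # assumes a `context.load_file` helper and alephclient's `ingest_upload`
    # call, neither of which is shown in the excerpt above.
    meta['source_url'] = source_url
    context.log.info("Aleph upload: %s", foreign_id)
    with context.load_file(content_hash) as fh:
        if fh is not None:
            api.ingest_upload(collection_id, fh.name, metadata=meta)


# Command-line entry point: copies the cache, incremental and debug options
# into the global settings, configures logging accordingly, then initialises
# memorious.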
def cli(debug, cache, incremental):
    """Crawler framework for documents and structured scrapers."""
    settings.HTTP_CACHE = cache
    settings.INCREMENTAL = incremental
    settings.DEBUG = debug
    if settings.DEBUG:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
    init_memorious()
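
# Lazily construct the singleton CrawlerManager and load every crawler
# definition found under the configured path.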
def load_manager():
    if not hasattr(settings, '_manager'):
        from memorious.logic.manager import CrawlerManager
        settings._manager = CrawlerManager()
        if settings.CONFIG_PATH:
            settings._manager.load_path(settings.CONFIG_PATH)
    return settings._manager
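
# Usage sketch (assumed interface): look up one configured crawler by name and
# run it. The `get` and `run` calls are assumptions about the manager and
# crawler API, not taken from the excerpts on this page.
def run_one_crawler(name):
    crawler = load_manager().get(name)
    if crawler is not None:
        crawler.run()


# Per-operation bookkeeping: mirror crawler activity into Redis counters so a
# dashboard can report per-stage totals and the time of the last run.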
def log_operation_start(context):
    if settings.REDIS_HOST:
        r = redis.Redis(connection_pool=redis_pool)
        crawler_name = context.crawler.name
        stage_name = context.stage.name
        r.incr(crawler_name)
        r.incr(crawler_name + ":" + stage_name)
        r.incr(crawler_name + ":total_ops")
        # Redis values must be strings, so store the timestamp as ISO 8601.
        r.set(crawler_name + ":last_run", datetime.datetime.now().isoformat())
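
# Build a fresh requests session for the crawler, switching to a random
# User-Agent when the crawler is marked as stealthy.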
def reset(self):
    self.session = Session()
    self.session.headers['User-Agent'] = settings.USER_AGENT
    if self.context.crawler.stealthy:
        self.session.headers['User-Agent'] = UserAgent().random()
    return self.session
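
# Crawler definition: parse a crawler's YAML file and materialise its
# metadata, schedule and pipeline stages.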
def __init__(self, manager, source_file):
    self.manager = manager
    self.source_file = source_file
    with io.open(source_file, encoding='utf-8') as fh:
        self.config_yaml = fh.read()
        self.config = yaml.safe_load(self.config_yaml)
    self.name = os.path.basename(source_file)
    self.name = self.config.get('name', self.name)
    self.description = self.config.get('description', self.name)
    self.category = self.config.get('category', 'scrape')
    self.schedule = self.config.get('schedule')
    self.disabled = self.config.get('disabled', False)
    self.init_stage = self.config.get('init', 'init')
    self.delta = Crawler.SCHEDULES.get(self.schedule)
    self.delay = int(self.config.get('delay', 0))
    # 'expire' is given in days; convert to seconds (86400 per day).
    self.expire = int(self.config.get('expire', settings.EXPIRE)) * 86400
    self.stealthy = self.config.get('stealthy', False)
    self.aggregator_config = self.config.get('aggregator', {})
    self.stages = {}
    for name, stage in self.config.get('pipeline', {}).items():
        self.stages[name] = CrawlerStage(self, name, stage)
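
# The keys read by the constructor above map onto a crawler YAML file; loaded
# with yaml.safe_load, an illustrative config (stage names, methods and params
# here are invented for the example) would come out roughly as:
EXAMPLE_CONFIG = {
    'name': 'example_articles',
    'description': 'Example article scraper',
    'schedule': 'weekly',
    'delay': 2,
    'stealthy': True,
    'pipeline': {
        'init': {
            'method': 'seed',
            'params': {'urls': ['https://example.com/articles']},
            'handle': {'pass': 'fetch'},
        },
    },
}


# Lazily connect the shared datastore through the `dataset` library, without
# connection pooling, and store integers as BIGINT by default.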
def load_datastore():
    if not hasattr(settings, '_datastore'):
        if not settings.DATASTORE_URI:
            raise RuntimeError("No $MEMORIOUS_DATASTORE_URI.")
        # do not pool connections for the datastore
        engine_kwargs = {'poolclass': NullPool}
        settings._datastore = dataset.connect(settings.DATASTORE_URI,
                                              engine_kwargs=engine_kwargs)
        # Use bigint to store integers by default
        settings._datastore.types.integer = settings._datastore.types.bigint
    return settings._datastore
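
# Usage sketch with the `dataset` API: tables are created lazily on first
# write and `upsert` deduplicates on the given key columns. The table and
# column names are invented for the example.
def save_article(url, title):
    table = load_datastore()['example_articles']
    table.upsert({'url': url, 'title': title}, ['url'])


# Periodic tick: outside of debug mode, ask the scheduler whether any crawler
# is due and run the scheduled ones.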
def periodic(self):
    if self.scheduler.check() and not settings.DEBUG:
        log.info("Running scheduled crawlers ...")
        self.scheduler.update()
        manager.run_scheduled()