Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# scrapelib setup
# NOTE(review): this is the interior of a method whose `def` line is not
# visible here, and the original indentation has been lost -- confirm the
# nesting below against the real file before relying on these notes.
# Copy the SCRAPELIB_* values found on this object onto the
# timeout / rate-limit / retry attributes.
self.timeout = self.SCRAPELIB_TIMEOUT
self.requests_per_minute = self.SCRAPELIB_RPM
self.retry_attempts = self.SCRAPELIB_RETRY_ATTEMPTS
self.retry_wait_seconds = self.SCRAPELIB_RETRY_WAIT_SECONDS
# robots.txt handling is explicitly switched off.
self.follow_robots = False
# Proxy support is currently disabled (commented out).
# if self.PROXIES:
# self.proxies = self.PROXIES
# When FASTMODE is set, cache_write_only is turned off.
# NOTE(review): presumably this lets cached responses be read back as well
# as written -- confirm against the scrapelib documentation.
if self.FASTMODE:
self.cache_write_only = False
# Responses are cached on disk in a '.cache' directory (relative path).
# NOTE(review): cannot tell from here whether the two lines below belong
# to the FASTMODE branch or run unconditionally.
cache_dir = '.cache'
self.cache_storage = scrapelib.FileCache(cache_dir)
def __init__(self, metadata, output_dir=None, strict_validation=None,
             fastmode=False, options=None):
    """
    Create a new Scraper instance.

    :param metadata: metadata for this scraper
    :param output_dir: the data directory to use; it is created if it
        does not exist yet, and no directory is created when it is None
    :param strict_validation: exit immediately if validation fails
    :param fastmode: when true, set ``requests_per_minute`` to 0 and
        ``cache_write_only`` to False
    :param options: dict of extra options, stored as ``self.options``;
        a fresh empty dict is used when omitted
    """
    super(Scraper, self).__init__()

    # scrapelib overrides
    self.timeout = settings.SCRAPELIB_TIMEOUT
    self.cache_storage = scrapelib.FileCache(settings.BILLY_CACHE_DIR)
    self.requests_per_minute = settings.SCRAPELIB_RPM
    self.retry_attempts = settings.SCRAPELIB_RETRY_ATTEMPTS
    self.retry_wait_seconds = settings.SCRAPELIB_RETRY_WAIT_SECONDS

    if fastmode:
        self.requests_per_minute = 0
        self.cache_write_only = False

    self.metadata = metadata
    self.output_dir = output_dir
    self.output_names = set()
    # A literal ``{}`` default would be one dict shared by every instance,
    # so build a new one per instance instead.
    self.options = {} if options is None else options

    # make output_dir
    # ``makedirs`` lives in ``os``, not ``os.path``; the old
    # ``os.path.makedirs`` call raised AttributeError whenever the
    # directory was actually missing.
    if self.output_dir is not None and not os.path.isdir(self.output_dir):
        os.makedirs(self.output_dir)
# NOTE(review): this is the body of a method whose `def` line is not visible
# here (presumably an __init__ receiving jurisdiction, datadir,
# strict_validation and fastmode -- confirm), and the original indentation
# has been lost.
super(Scraper, self).__init__()
# set options
self.jurisdiction = jurisdiction
self.datadir = datadir
# scrapelib setup
# Timeout, rate limit, retry policy and the verify flag all come from the
# shared `settings` object.
self.timeout = settings.SCRAPELIB_TIMEOUT
self.requests_per_minute = settings.SCRAPELIB_RPM
self.retry_attempts = settings.SCRAPELIB_RETRY_ATTEMPTS
self.retry_wait_seconds = settings.SCRAPELIB_RETRY_WAIT_SECONDS
self.verify = settings.SCRAPELIB_VERIFY
# caching
# A file-backed cache is installed only when settings.CACHE_DIR is truthy.
if settings.CACHE_DIR:
self.cache_storage = scrapelib.FileCache(settings.CACHE_DIR)
# fastmode sets the rate limit to 0 and turns cache_write_only off.
# NOTE(review): cannot tell from here whether this check is nested under the
# CACHE_DIR check above or stands on its own.
if fastmode:
self.requests_per_minute = 0
self.cache_write_only = False
# validation
self.strict_validation = strict_validation
# 'type' -> {set of names}
self.output_names = defaultdict(set)
# logging convenience methods
# Bind the "pupa" logger's methods onto the instance so they can be called
# as self.info(...), self.debug(...) and self.warning(...).
self.logger = logging.getLogger("pupa")
self.info = self.logger.info
self.debug = self.logger.debug
self.warning = self.logger.warning