def dump_json(abbr, filename, validate, schema_dir):
    scraper = scrapelib.Scraper(requests_per_minute=600, follow_robots=False)
    level = metadata(abbr)['level']

    zip = zipfile.ZipFile(filename, 'w', zipfile.ZIP_DEFLATED)

    if not schema_dir:
        cwd = os.path.split(__file__)[0]
        schema_dir = os.path.join(cwd, "../schemas/api/")

    with open(os.path.join(schema_dir, "bill.json")) as f:
        bill_schema = json.load(f)

    with open(os.path.join(schema_dir, "legislator.json")) as f:
        legislator_schema = json.load(f)

    with open(os.path.join(schema_dir, "committee.json")) as f:
        committee_schema = json.load(f)
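The schemas loaded above are presumably applied to each exported record before it is written into the zip archive. The validator itself is not part of this excerpt; a minimal sketch of that step using the third-party jsonschema package (an assumption about tooling, not necessarily what the original project uses):

import jsonschema

def validate_record(record, schema):
    # raises jsonschema.ValidationError if the record does not conform to the schema
    jsonschema.validate(record, schema)

# e.g. validate_record(bill, bill_schema) for each bill before dumping it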
def get_session_list(self):
    text = scrapelib.Scraper().get("ftp://ftp.cga.ct.gov").text
    sessions = [line.split()[-1] for line in text.splitlines()]
    return [session for session in sessions if session not in SKIP_SESSIONS]
def get_session_list(self):
    import scrapelib
    import lxml.html

    url = "http://www.legis.nd.gov/assembly/"
    html = scrapelib.Scraper().get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    sessions = doc.xpath("//div[@class='view-content']//a/text()")
    sessions = [
        session for session in sessions if "Territorial Assembly" not in session
    ]
    return sessions
def get_session_list(self):
    scraper = scrapelib.Scraper(requests_per_minute=40)
    vals = url_xpath('http://lis.virginia.gov',
                     '//div[@id = "sLink"]//option[@value != "01"]/@value',
                     requester=scraper)
    sessions = [get_session_id(val, scraper) for val in vals]
    return [session for session in sessions if session is not None]
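url_xpath and get_session_id are project helpers that this excerpt does not include. A plausible url_xpath, assuming it simply fetches the page with the supplied requester (or a default Scraper) and evaluates the XPath expression with lxml; treat it as a hypothetical reconstruction rather than the project's actual helper:

import lxml.html
import scrapelib

def url_xpath(url, path, requester=None):
    # fetch the page and return whatever the XPath expression selects
    requester = requester or scrapelib.Scraper()
    doc = lxml.html.fromstring(requester.get(url).text)
    return doc.xpath(path)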
#!/usr/bin/env python
from StringIO import StringIO
import argparse
import json
from datetime import datetime

import scrapelib
from lxml import etree

from utils import log, download, write, log_dir

# initialize scraper and parser
s = scrapelib.Scraper(requests_per_minute=60, follow_robots=False)
parser = etree.HTMLParser()

scrapelog = {
    "begin": datetime.now().strftime("%Y-%m-%d-%H:%M:%S"),
    "signatures": {},
}
def petitions(start=1, mx=None):
    if mx is None:
        mx = -1

    # log objects for tracking signatures over time
    hits = 0

    # scan the White House site and add any new petitions to the DB;
    # there must be a better way to fetch an indefinite number of results
    # than looping "forever" and breaking out, but drawing a blank
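One common shape for the open-ended loop the comment above describes is to keep requesting pages until one comes back empty. This is a sketch of that idiom only; fetch_page is a placeholder for the real request-and-parse step, not something defined in this codebase:

def fetch_page(page):
    # placeholder: request one results page and return the parsed items
    return []

def scan_pages(start=1, mx=-1):
    hits, page = 0, start
    while True:
        results = fetch_page(page)
        # stop when a page comes back empty or the optional cap is reached
        if not results or (mx != -1 and hits >= mx):
            return hits
        hits += len(results)
        page += 1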
            text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
        except KeyError:
            pass
        return text  # leave as is

    text = re.sub(r"&#?\w+;", fixup, text)
    text = remove_unicode_control(text)
    return text
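The fragment above is the tail of the classic HTML-entity unescape idiom; the top of the function, including the numeric-reference branch, is cut off in this excerpt, and it relies on the Python 2 names unichr and htmlentitydefs. For reference, a self-contained Python 3 rendering of the same idiom (html.entities replaces htmlentitydefs, chr replaces unichr); remove_unicode_control is project-specific and not reproduced here:

import re
import html.entities

def unescape(text):
    # replace &name;, &#NNN; and &#xNNN; references with the characters they encode
    def fixup(m):
        ent = m.group(0)
        if ent[:2] == "&#":
            try:
                if ent[:3].lower() == "&#x":
                    return chr(int(ent[3:-1], 16))
                return chr(int(ent[2:-1]))
            except ValueError:
                pass
        else:
            try:
                return chr(html.entities.name2codepoint[ent[1:-1]])
            except KeyError:
                pass
        return ent  # leave unrecognized references as-is
    return re.sub(r"&#?\w+;", fixup, text)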
##### Downloading

# os.getcwd() (the current working directory) is used so these paths work for
# file system access in Cloud9 IDE

import scrapelib

scraper = scrapelib.Scraper(requests_per_minute=120, follow_robots=False, retry_attempts=3)

# uses config values if present
def cache_dir():
    return os.getcwd() + "/cache"

# uses config values if present
def data_dir():
    return os.getcwd() + "/data"

# uses config values if present
def log_dir():
    return os.getcwd() + "/log"

def download(url, destination, force=False, options=None):
    if not options:
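The download helper is truncated here, but the directory helpers above suggest responses are cached under cache/ in the working directory. scrapelib ships a file-based cache that can be pointed at that same directory; a sketch of wiring it up, with the cache settings taken from scrapelib's caching API and everything else assumed:

import os
import scrapelib
from scrapelib.cache import FileCache

scraper = scrapelib.Scraper(requests_per_minute=120, retry_attempts=3)
# store responses on disk and reuse them on later runs
scraper.cache_storage = FileCache(os.path.join(os.getcwd(), "cache"))
scraper.cache_write_only = False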
import re
import os
import csv
import time
import logging
import logging.config
from os.path import join

import scrapelib

path = '/home/thom/sunlight/python-opencivicdata/opencivicdata/division-ids/identifiers/country-us'


class Checker(scrapelib.Scraper):

    OUTFILE = 'domains.csv'

    SCRAPELIB_RPM = 10
    SCRAPELIB_TIMEOUT = 60
    SCRAPELIB_RETRY_ATTEMPTS = 0
    SCRAPELIB_RETRY_WAIT_SECONDS = 20

    FASTMODE = True
    # PROXIES = dict(http="http://localhost", https='https://localhost')

    BOGUS_DOMAIN_MESSAGE = 'Invalid parameters!!'

    def __init__(self):
        super().__init__()
        self.checked_places = set()
        logging.config.dictConfig(self.LOGGING_CONFIG)
        self.logger = logging.getLogger('legistar')
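The SCRAPELIB_* constants are presumably wired into the parent Scraper somewhere outside this excerpt (self.LOGGING_CONFIG is likewise defined elsewhere). A hedged sketch of what that wiring might look like, assuming the constants map directly onto scrapelib.Scraper's constructor arguments:

import scrapelib

# sketch only: build a plain Scraper from the Checker constants shown above
scraper = scrapelib.Scraper(
    requests_per_minute=Checker.SCRAPELIB_RPM,
    retry_attempts=Checker.SCRAPELIB_RETRY_ATTEMPTS,
    retry_wait_seconds=Checker.SCRAPELIB_RETRY_WAIT_SECONDS,
)
scraper.timeout = Checker.SCRAPELIB_TIMEOUT  # assumption: scrapelib honors a timeout attribute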