import json
import pymongo
from urllib2 import urlopen  # Python 2-era code, matching pymongo's old Connection API

def main():
    # `settings` is the project module supplying the MongoDB host/port.
    conn = pymongo.Connection(settings.MONGO_HOST, settings.MONGO_PORT)
    tweets = conn['openstates_web']['tweets']
    # urlopen returns a file-like object, so read it before decoding the JSON.
    data = urlopen('http://api.twitter.com/1/statuses/user_timeline.json?screen_name=openstates&count=1&trim_user=1')
    data = json.loads(data.read())
    # Replace the cached copy with the fresh tweet.
    tweets.drop()
    tweets.insert(data, safe=True)
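The snippet above targets long-retired APIs (pymongo's Connection class, Twitter API v1, Python 2's urlopen). A minimal modern sketch of the same cache-one-tweet idea, assuming requests and pymongo 3+; the helper name and timeline_url parameter are hypothetical stand-ins:

import pymongo
import requests

def cache_latest_tweet(mongo_host, mongo_port, timeline_url):
    # Hypothetical helper; timeline_url stands in for whatever endpoint
    # currently serves the account's latest status as a JSON list.
    client = pymongo.MongoClient(mongo_host, mongo_port)
    tweets = client["openstates_web"]["tweets"]
    latest = requests.get(timeline_url, params={"count": 1}).json()
    tweets.drop()  # keep only the freshest copy
    tweets.insert_many(latest)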
import json
import os
import zipfile

import scrapelib

def dump_json(abbr, filename, validate, schema_dir):
    scraper = scrapelib.Scraper(requests_per_minute=600, follow_robots=False)
    # `metadata` is the project helper returning per-state metadata.
    level = metadata(abbr)['level']
    zip = zipfile.ZipFile(filename, 'w', zipfile.ZIP_DEFLATED)  # note: shadows the builtin zip()

    # Default to the API schemas bundled alongside this module.
    if not schema_dir:
        cwd = os.path.split(__file__)[0]
        schema_dir = os.path.join(cwd, "../schemas/api/")

    with open(os.path.join(schema_dir, "bill.json")) as f:
        bill_schema = json.load(f)
    with open(os.path.join(schema_dir, "legislator.json")) as f:
        legislator_schema = json.load(f)
    with open(os.path.join(schema_dir, "committee.json")) as f:
        committee_schema = json.load(f)
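The three schemas loaded above feed a validation step later in dump_json, outside this excerpt. A minimal sketch of what that step could look like using the jsonschema package; the sample bill dict is invented for illustration:

import jsonschema

sample_bill = {"title": "An example act", "state": "xx"}  # invented record
try:
    jsonschema.validate(sample_bill, bill_schema)
except jsonschema.ValidationError as exc:
    print("bill failed schema validation:", exc.message)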
def get_session_list(self):
    # Session names are the last column of the FTP directory listing.
    text = scrapelib.Scraper().get("ftp://ftp.cga.ct.gov").text
    sessions = [line.split()[-1] for line in text.splitlines()]
    return [session for session in sessions if session not in SKIP_SESSIONS]
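Both excerpts above lean on scrapelib, which wraps requests with rate limiting and retry behavior. A self-contained sketch of the basic pattern (the URL is a placeholder):

import scrapelib

# Throttled, retrying HTTP client; .get() returns a requests-style response.
s = scrapelib.Scraper(requests_per_minute=60, retry_attempts=3)
resp = s.get("https://example.com")
print(resp.status_code, len(resp.text))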
def scrape_vote_text(self, filelocation, local=False):
    """Retrieve (or use a local copy of) a vote PDF and convert it to XML."""
    if not local:
        try:
            # Download the PDF, convert it, then clean up the temp file.
            filename, response = self.urlretrieve(url=filelocation)
            vote_text = convert_pdf(filename, type="xml")
            os.remove(filename)
        except scrapelib.HTTPError:
            self.warning("Request failed: {}".format(filelocation))
            return
    else:
        vote_text = convert_pdf(filelocation, type="xml")
        os.remove(filelocation)
    return vote_text
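convert_pdf here is the project's PDF-to-XML helper; the openstates utility of that name shells out to poppler. A rough stand-alone equivalent using pdftohtml, assuming poppler is installed:

import subprocess

def pdf_to_xml(path):
    # Roughly what convert_pdf(filename, type="xml") produces:
    # poppler's pdftohtml emits an XML rendition of the PDF on stdout.
    return subprocess.check_output(["pdftohtml", "-xml", "-stdout", path])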
# Excerpt: the enclosing try block and the opening of the Person(...)
# call (name, district, party, etc.) precede this fragment.
        image=photo_url,
        primary_org="legislature",
    )
    person.add_link(rep_url)
    person.add_source(rep_url)

    # Attach capitol office contact details, skipping empty fields.
    note = "Capitol Office"
    person.add_contact_detail(type="address", value=address, note=note)
    if phone:
        person.add_contact_detail(type="voice", value=phone, note=note)
    if email:
        person.add_contact_detail(type="email", value=email, note=note)
    yield person
except scrapelib.HTTPError:
    self.warning("could not retrieve %s" % rep_url)
# Excerpt began mid-dict: the party lookup maps the feed's one-letter
# codes to full names; the mapping shown here is abbreviated.
party = {
    "D": "Democratic",
    "R": "Republican",
}[leg_info["party"]]
photo_url = leg_info["image"]
leg_id = leg_info["id"]

if leg_info["house"] == "H":
    leg_url = house_base_url + "detail.jsp?i=" + leg_id
    leg = Legislator(term, 'lower', district, leg_name,
                     party=party, photo_url=photo_url, url=leg_url)
    leg.add_source(leg_url)
    leg = self.scrape_house_member(leg_url, leg)
else:
    leg_url = (senate_base_url +
               "senators/district{dist}.html".format(dist=district))
    # Senate district pages 404 for some members; check before linking.
    try:
        self.head(leg_url)
    except HTTPError:
        self.logger.warning("Bad link for {sen}".format(sen=leg_name))
        leg = Legislator(term, 'upper', district, leg_name,
                         party=party, photo_url=photo_url)
    else:
        leg = Legislator(term, 'upper', district, leg_name,
                         party=party, photo_url=photo_url, url=leg_url)
        leg.add_source(leg_url)

address = leg_info.get('address', None)
fax = leg_info.get('fax', None)
cell = leg_info.get('cell', None)
home_phone = leg_info.get('homePhone', None)
work_phone = leg_info.get('workPhone', None)
# Excerpt from inside the loop over a bill's versions; the floor-vote
# lookup that produced vote_doc opens above this fragment.
votes = vote_doc.json()
yield from self.process_vote(
    votes,
    vote_url,
    base_url,
    bill,
    legislators,
    chamber_dict,
    vote_results,
)

# Committee votes live under the version's "cmtevotes" key and go
# through the same process_vote pipeline.
vote_url = base_url + bill_version["cmtevotes"][0]["link"]
try:
    vote_doc = self.get(vote_url)
except scrapelib.HTTPError:
    self.warning("Vote page not loading; skipping: {}".format(vote_url))
    continue
votes = vote_doc.json()
yield from self.process_vote(
    votes,
    vote_url,
    base_url,
    bill,
    legislators,
    chamber_dict,
    vote_results,
)

if data["items"][0]["effective_date"]:
    pass  # effective-date handling is truncated in the original excerpt
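process_vote is itself a generator, so yield from re-yields each vote event it produces up to the calling framework one at a time. A minimal illustration of that delegation pattern:

def inner(values):
    for v in values:
        yield v * 2

def outer():
    # yield from forwards everything inner() yields, item by item.
    yield from inner([1, 2, 3])

print(list(outer()))  # [2, 4, 6]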
# set to joint if we are using the special_committees
com_chamber = (
    "legislature" if com_type == "special_committees" else chamber
)
committee = Organization(
    committee_data["TITLE"],
    chamber=com_chamber,
    classification="committee",
)

# Fetch the committee detail record from the Kansas API.
com_url = ksapi.url + "ctte/%s/" % committee_data["KPID"]
try:
    detail_json = self.get(com_url).text
except scrapelib.HTTPError:
    self.warning("error fetching committee %s" % com_url)
    continue
details = json.loads(detail_json)["content"]

# Chairs occasionally lack a FULLNAME; fall back to parsing the KPID.
for chair in details["CHAIR"]:
    if chair.get("FULLNAME", None):
        chair_name = chair["FULLNAME"]
    else:
        chair_name = self.parse_kpid(chair["KPID"])
        self.warning("no FULLNAME for %s", chair["KPID"])
    committee.add_member(chair_name, "chairman")
for vicechair in details["VICECHAIR"]:
    committee.add_member(vicechair["FULLNAME"], "vice-chairman")
for rankedmember in details["RMMEM"]:
    committee.add_member(rankedmember["FULLNAME"], "ranking member")
for member in details["MEMBERS"]:
    committee.add_member(member["FULLNAME"])
html = self.get(url).text
doc = lxml.html.fromstring(html)
for com in doc.xpath('//h2[@class="commhighlight"]'):
    # The "Members" link lives in the paragraph right after the heading.
    members_url = com.xpath(
        'following-sibling::p[1]/a[text()="Members"]/@href'
    )[0]
    com = Organization(com.text, chamber="lower", classification="committee")
    com.add_source(members_url)
    try:
        member_html = self.get(members_url).text
        mdoc = lxml.html.fromstring(member_html)
    except HTTPError:
        self.warning(
            "Member list for {} failed to respond; skipping".format(com.name)
        )
        continue
    # Each legislator is in their own table; the first row's second
    # column contains all the info.
    for ltable in mdoc.xpath("//table/tr[1]/td[2]/p/b[1]"):
        # The name is the tail string of the last element.
        name = ltable.text_content()
        text = ltable.text
        if text and name != text:
            name = name.replace(text, "")
        # the role is inside a nested b tag (excerpt ends here)
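The name handling above relies on lxml's distinction between .text (the text before the first child element) and .text_content() (all descendant text). A self-contained illustration:

import lxml.html

el = lxml.html.fromstring("<p>Jane Doe <b>(Chair)</b></p>")
print(el.text_content())  # "Jane Doe (Chair)" - all descendant text
print(el.text)            # "Jane Doe " - only the text before <b>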
def parse_html_vote(self, bill, actor, date, motion, url, uniqid):
    try:
        page = self.get(url).text
    except scrapelib.HTTPError:
        self.warning("Vote page not found for bill {}".format(bill.identifier))
        return
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    # Older pages put the description in the first <b>; newer ones use
    # the first <center> instead.
    descr = page.xpath("//b")[0].text_content()
    if descr == "":
        descr = page.xpath("//center")[0].text
    if "on voice vote" in descr:
        return
    if "committee" in descr.lower():
        yield from self.scrape_committee_vote(
            bill, actor, date, motion, page, url, uniqid
        )