Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
if block:
block = block[0].text.replace('\r\n', ' ')
pieces = block.split('--')
# if there are only two pieces, there are no abstentions
if len(pieces) <= 2:
return []
else:
# lookahead and don't split if comma precedes initials
# Also, Bell appears as Bell, Richard B. and Bell, Robert P.
# and so needs the lookbehind assertion.
return [x.strip() for x in re.split('(?
subjects = collections.defaultdict(list)
for link in self.doc.xpath('//ul[@class="linkSect"]/li/a'):
for bill_id in self.scrape_page(SubjectBillListPage, url=link.get('href')):
subjects[bill_id].append(link.text)
return subjects
class SubjectBillListPage(Page, Spatula):
    """Yields bill IDs from one page of a subject's bill list, following pagination."""

    def handle_page(self):
        # Each <li> under ul.linkSect holds the bill's link as its first child.
        for item in self.doc.xpath('//ul[@class="linkSect"]/li'):
            first_child = item.getchildren()[0]
            yield str(first_child.text_content())
        # A "More..." link signals another page of results; recurse into it.
        more = self.doc.xpath('//a/b[text()="More..."]/../@href')
        if more:
            yield from self.scrape_page_items(SubjectBillListPage, url=more[0])
# NOTE(review): this BillListPage copy is truncated -- the bill_type dict
# literal below is cut off mid-definition and is followed by orphaned tuple
# rows that belong to an unrelated action-classifier table.
class BillListPage(Page, Spatula):
def handle_page(self):
bills = self.doc.xpath('//ul[@class="linkSect"]/li')
for bill in bills:
link = bill.getchildren()[0]
bill_id = str(link.text_content())
# BUG(review): operator precedence -- this parses as
#   (not bill_id.startswith('S')) or bill_id.startswith('H')
# so every House bill hits `continue` and only Senate bills survive.
# Intended test is presumably:
#   if not (bill_id.startswith('S') or bill_id.startswith('H')):
# i.e. skip only IDs that start with neither chamber prefix.
if not bill_id.startswith('S') or bill_id.startswith('H'):
continue
# create a bill
desc = bill.xpath('text()')[0].strip()
chamber = {
'H': 'lower',
'S': 'upper',
}[bill_id[0]]
# NOTE(review): dict literal truncated here; the lines that follow are
# spliced in from a different table and do not belong to this method.
bill_type = {'B': 'bill',
("Committee substitute printed", SKIP),
("Bill text as passed", SKIP),
("Acts of Assembly", SKIP),
)
class SubjectPage(Page, Spatula):
    """Builds a bill_id -> [subject, ...] mapping from the subject index page."""

    def handle_page(self):
        bill_subjects = collections.defaultdict(list)
        subject_links = self.doc.xpath('//ul[@class="linkSect"]/li/a')
        for subject_link in subject_links:
            # Each subject anchor leads to its own (paginated) bill list.
            href = subject_link.get("href")
            for bill_id in self.scrape_page(SubjectBillListPage, url=href):
                bill_subjects[bill_id].append(subject_link.text)
        return bill_subjects
class SubjectBillListPage(Page, Spatula):
    """Generator over bill IDs on a subject listing page, including "More..." pages."""

    def handle_page(self):
        rows = self.doc.xpath('//ul[@class="linkSect"]/li')
        for row in rows:
            bill_link = row.getchildren()[0]
            bill_id = str(bill_link.text_content())
            yield bill_id
        # Pagination: a bold "More..." anchor points at the next results page.
        pagination = self.doc.xpath('//a/b[text()="More..."]/../@href')
        if pagination:
            yield from self.scrape_page_items(SubjectBillListPage, url=pagination[0])
# NOTE(review): this span appears to be a corrupted splice of two different
# methods. It opens as BillListPage.handle_page (bill-link parsing) and then
# jumps into committee-roster parsing (//div[@id="members"], clean_name,
# add_member) that references `comm` -- a name never defined here and almost
# certainly belonging to a committee detail page's handle_page elsewhere.
class BillListPage(Page, Spatula):
def handle_page(self):
bills = self.doc.xpath('//ul[@class="linkSect"]/li')
for bill in bills:
link = bill.getchildren()[0]
bill_id = str(link.text_content())
# -- splice boundary: the remainder is committee-member code --
for dt in self.doc.xpath('//div[@id="members"]/dl/dt'):
role = dt.text.replace(": ", "").strip().lower()
member = dt.xpath("./following-sibling::dd")[0].text_content()
member = self.clean_name(member)
comm.add_member(member, role=role)
# Un-titled members appear as plain <li> entries with no role.
for ul in self.doc.xpath('//div[@id="members"]/ul/li'):
member = self.clean_name(ul.text_content())
comm.add_member(member)
comm.add_source(self.url)
yield comm
class FlCommitteeScraper(Scraper, Spatula):
    """Entry point for Florida committee scraping, covering both chambers."""

    def scrape(self):
        # Senate first, then House -- preserves the original emission order.
        for chamber_list in (SenComList, HouseComList):
            yield from self.scrape_page_items(chamber_list)
return name, action, date
class SenateList(MemberList):
    # Upper-chamber member list; senators are linked from the right column.
    chamber = "upper"
    detail_page = SenateDetail
    list_xpath = '//div[@class="lColRt"]/ul/li/a'
class DelegateList(MemberList):
    # Lower-chamber member list; delegates are linked from the left column.
    chamber = "lower"
    detail_page = DelegateDetail
    list_xpath = '//div[@class="lColLt"]/ul/li/a'
class VaPersonScraper(Scraper, Spatula):
    """Scrapes Virginia legislators from both chambers, then their committees."""

    def scrape(self, session=None):
        if not session:
            # Fall back to the most recent legislative session.
            session = self.jurisdiction.legislative_sessions[-1]
            self.info("no session specified, using %s", session["identifier"])
        site_id = SESSION_SITE_IDS[session["identifier"]]
        url = "http://lis.virginia.gov/{}/mbr/MBR.HTM".format(site_id)
        # Both chambers accumulate into a shared dict so each committee is
        # emitted exactly once after all members have been attached.
        committees = {}
        for member_list in (SenateList, DelegateList):
            yield from self.scrape_page_items(
                member_list, session=session, url=url, committees=committees
            )
        yield from committees.values()
('Read third time and agreed', ['passage', 'reading-3']),
('Passed (Senate|House)', 'passage'),
('Read third time and defeated', 'failure'),
('Presented', 'introduction'),
('Prefiled and ordered printed', 'introduction'),
('Read first time', 'reading-1'),
('Read second time', 'reading-2'),
('Read third time', 'reading-3'),
('Senators: ', None),
('Delegates: ', None),
('Committee substitute printed', None),
('Bill text as passed', None),
('Acts of Assembly', None),
)
class SubjectPage(Page, Spatula):
    """Maps each bill ID to the list of subjects it appears under."""

    def handle_page(self):
        mapping = collections.defaultdict(list)
        for anchor in self.doc.xpath('//ul[@class="linkSect"]/li/a'):
            # Every subject link expands to a (possibly paginated) bill list.
            bill_ids = self.scrape_page(SubjectBillListPage, url=anchor.get('href'))
            for bill_id in bill_ids:
                mapping[bill_id].append(anchor.text)
        return mapping
class SubjectBillListPage(Page, Spatula):
    """Yields each bill ID listed for a subject, walking "More..." pagination."""

    def handle_page(self):
        for entry in self.doc.xpath('//ul[@class="linkSect"]/li'):
            yield str(entry.getchildren()[0].text_content())
        more_link = self.doc.xpath('//a/b[text()="More..."]/../@href')
        if more_link:
            # Delegate to a fresh page object for the next chunk of results.
            yield from self.scrape_page_items(SubjectBillListPage, url=more_link[0])
for source in vote.sources:
if "+vot+" in source["url"]:
vote.pupa_id = source["url"]
break
else:
vote.pupa_id = None
if vote.pupa_id in _seen_pupa_ids:
# skip over votes we've already seen
return
else:
_seen_pupa_ids.add(vote.pupa_id)
yield vote
class VaBillScraper(Scraper, Spatula):
    """Top-level Virginia bill scraper: resolves the session, then walks the list."""

    def scrape(self, session=None):
        if not session:
            session = self.jurisdiction.legislative_sessions[-1]["identifier"]
            self.info("no session specified, using %s", session)
        session_id = SESSION_SITE_IDS[session]
        # The subject mapping is fetched once up front and threaded through
        # to the bill list page so bills can be tagged as they are parsed.
        subjects = self.scrape_page(
            SubjectPage, url=BASE_URL + URL_PATTERNS["subjects"].format(session_id)
        )
        yield from self.scrape_page_items(
            BillListPage,
            url=BASE_URL + URL_PATTERNS["list"].format(session_id),
            session=session,
            session_id=session_id,
            subjects=subjects,
        )
subject = None
for line in self.lines:
if SUBJ_RE.match(line):
subject = line.lower().strip()
elif subject and BILL_RE.findall(line):
for bill in BILL_RE.findall(line):
# normalize bill id to [SH]#
bill = bill.replace("-", "")
subjects[bill].add(subject)
return subjects
# NOTE(review): the region below is a corrupted splice of two scrapers.
# FlBillScraper.scrape (Florida) runs through its session/URL setup, then the
# text jumps into Virginia bill-detail code (BASE_URL + URL_PATTERNS
# ['sponsors'], BillSponsorPage / BillDetailPage) that references names
# (bill_id, bill, bill_url, self.kwargs) never defined in this method, and
# ends with a BillListPage pagination fragment. Needs reconstruction from
# the original sources before it can run.
class FlBillScraper(Scraper, Spatula):
def scrape(self, session=None):
# FL published a bad bill in 2019, #143
self.raise_errors = False
self.retry_attempts = 1
self.retry_wait_seconds = 3
if not session:
session = self.latest_session()
self.info("no session specified, using %s", session)
subject_url = "http://www.leg.state.fl.us/data/session/{}/citator/Daily/subindex.pdf".format(
session
)
subjects = self.scrape_page(SubjectPDF, subject_url)
url = "http://flsenate.gov/Session/Bills/{}?chamber=both".format(session)
# -- splice boundary: Virginia bill-detail code begins here --
sponsor_url = BASE_URL + URL_PATTERNS['sponsors'].format(
self.kwargs['session'],
bill_id.replace(' ', ''),
)
list(self.scrape_page_items(BillSponsorPage, url=sponsor_url, obj=bill))
list(self.scrape_page_items(BillDetailPage, url=bill_url, obj=bill))
bill.subject = self.kwargs['subjects'][bill_id]
bill.add_source(bill_url)
yield bill
# -- splice boundary: BillListPage pagination fragment --
next_url = self.doc.xpath('//a/b[text()="More..."]/../@href')
if next_url:
yield from self.scrape_page_items(BillListPage, url=next_url[0], **self.kwargs)
class BillSponsorPage(Page, Spatula):
    """Adds sponsorship records from a bill's sponsor listing onto self.obj."""

    def handle_page(self):
        # \xa0 is a non-breaking space; suffix lengths (15 / 18) match the
        # hard-coded slice offsets in the original implementation.
        chief = u' (chief\xa0patron)'
        chief_co = u' (chief\xa0co-patron)'
        for sponsor_list in self.doc.xpath('//ul[@class="linkSect"]'):
            # note that first ul is origin chamber
            for entry in sponsor_list.xpath('li'):
                name = entry.text_content().strip()
                if name.endswith(chief):
                    name = name[:-len(chief)]
                    sponsor_type = 'primary'
                elif name.endswith(chief_co):
                    name = name[:-len(chief_co)]
                    sponsor_type = 'cosponsor'
                else:
                    sponsor_type = 'cosponsor'
                self.obj.add_sponsorship(
                    name, sponsor_type, 'person', sponsor_type == 'primary'
                )
        yield self.obj