Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def parse_pdf(file_obj):
    """Parse every data page of the PDF in *file_obj* into one table.

    Each page after the first is passed to ``parse_page``; the per-page
    results are concatenated with a fresh 0..n-1 index, and rows whose
    ``"state"`` value is ``"Totals"`` are left out of the returned table.
    """
    # NOTE(review): later pdfplumber releases deprecate ``load`` in favour
    # of ``open`` -- confirm against the pinned pdfplumber version.
    document = pdfplumber.load(file_obj)

    # Note: As of Nov. 2019 file, first page is documentation, so skip it.
    data_pages = document.pages[1:]
    page_tables = [parse_page(page) for page in data_pages]

    combined = pd.concat(page_tables).reset_index(drop=True)
    keep_row = combined["state"] != "Totals"
    return combined[keep_row]
def main():
    """Load the PDF named by the parsed arguments and write it out.

    The output writer is chosen from ``args.format``: ``"csv"`` selects
    ``to_csv``; any other value selects ``to_json``.
    """
    args = parse_args()
    pdf = pdfplumber.load(args.infile, pages=args.pages)

    # Both writers take the same arguments, so choose one and call it once.
    write_output = to_csv if args.format == "csv" else to_json
    write_output(pdf, args.types, args.encoding)
import requests
import datetime
import re
from io import BytesIO
def parse_date(pdf):
    """Return the "UPDATED: As of ..." date found on the first page of *pdf*.

    Args:
        pdf: Object exposing ``pages``; the first page's ``extract_text``
            is called with ``x_tolerance=5``.

    Returns:
        A ``datetime.datetime`` parsed from text such as
        ``"November 30, 2019"`` (format ``"%B %d, %Y"``).

    Raises:
        ValueError: If the "UPDATED: As of" marker is not present in the
            extracted text, or the text after it is not a valid date.
    """
    # Treat "no text extracted" like an empty page so the failure below is
    # a clear ValueError rather than a TypeError from ``re.search``.
    text = pdf.pages[0].extract_text(x_tolerance=5) or ""

    date_pat = r"UPDATED:\s+As of (.+)\n"
    match = re.search(date_pat, text)
    if match is None:
        raise ValueError(
            "could not find an 'UPDATED: As of <date>' line on the first page"
        )

    # Stray spaces around the captured date would make strptime reject it.
    updated_date = match.group(1).strip()
    return datetime.datetime.strptime(updated_date, "%B %d, %Y")
if __name__ == "__main__":
    # Script entry point: download the NICS index PDF and print the month
    # (YYYY-MM) of its "UPDATED: As of" date.
    URL = "https://www.fbi.gov/file-repository/active_records_in_the_nics-index.pdf"

    # Without a timeout, requests can wait on an unresponsive server forever.
    response = requests.get(URL, timeout=60)
    # Stop on an HTTP error instead of handing an error page to the PDF parser.
    response.raise_for_status()
    raw = response.content

    # NOTE(review): later pdfplumber releases deprecate ``load`` in favour
    # of ``open`` -- confirm against the pinned pdfplumber version.
    pdf = pdfplumber.load(BytesIO(raw))
    d = parse_date(pdf)
    print(d.strftime("%Y-%m"))