Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
document_id = document_data['id']
response = requests.get(url)
# Sometimes, Metro Legistar has a URL that returns a bad status code (e.g., 404 from http://metro.legistar1.com/metro/attachments/95d5007e-720b-4cdd-9494-c800392b9265.pdf).
# Skip these documents.
if response.status_code != 200:
logger.error('Document URL {} returns {} - Could not get attachment text!'.format(url, response.status_code))
continue
extension = os.path.splitext(url)[1]
with tempfile.NamedTemporaryFile(suffix=extension) as tfp:
tfp.write(response.content)
try:
plain_text = textract.process(tfp.name)
except textract.exceptions.ShellError as e:
logger.error('{} - Could not convert Document ID {}!'.format(e, document_id))
continue
logger.info('Document ID {} - conversion complete'.format(document_id))
yield {'plain_text': plain_text.decode('utf-8'), 'id': document_id}
def convert(self):
    """Convert the previously downloaded PDF into a text string.

    Calls ``self.download()``, extracts the text of ``self.filename`` with
    textract (http://textract.readthedocs.io/en/stable/) using the
    ``pdftotext`` method, then feeds the result to BeautifulSoup's
    ``html.parser`` and returns the prettified output.

    Unless ``self.save`` is truthy, the file at ``self.filename`` is removed
    once extraction has been attempted -- on success and on failure alike,
    so a file that cannot be converted is not left behind on disk.

    Returns:
        The result of ``soup.prettify(formatter=None)``.

    Raises:
        NameError: if textract raises ``ShellError`` while processing the
            file; the original error is chained as ``__cause__``.
    """
    import textract

    # The return value of download() was never used, so it is not bound.
    # NOTE(review): presumably download() writes self.filename -- confirm.
    self.download()

    try:
        # Keep the try body to the single call that can raise ShellError.
        source_binary = textract.process(self.filename, encoding='utf_8',
                                         method='pdftotext', layout=True)
    except textract.exceptions.ShellError as err:
        # TODO: implement a way to handle non-PDF files.
        print('Not a pdf')
        raise NameError('The file is not a .pdf') from err
    finally:
        # Delete the downloaded file unless the caller explicitly asked to
        # keep it. The existence check keeps a missing file from masking
        # whatever exception is already propagating.
        if not self.save and os.path.exists(self.filename):
            os.remove(self.filename)

    soup = BeautifulSoup(source_binary, "html.parser")
    return soup.prettify(formatter=None)