Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def process_input_file(client, voice, audio_config, input_file: Path, output_directory_path):
    """Turn one input document into an MP3 placed in the output directory.

    The document's text is extracted with ``textract`` and handed to
    ``text_to_mp3`` together with ``client``, ``voice`` and ``audio_config``,
    which are forwarded untouched. The output file reuses the input file's
    stem with an ``.mp3`` suffix.
    """
    logger.info(f'Processing input file `{input_file}`')

    mp3_name = input_file.stem + '.mp3'
    output_file = output_directory_path / mp3_name
    logger.info(f'Target output file is: `{output_file}`')

    # textract expects a plain string path, not a Path object.
    extracted = textract.process(str(input_file))

    synthesis_args = {
        'client': client,
        'voice': voice,
        'audio_config': audio_config,
        'text': extracted,
        'output_file_path': output_file,
    }
    text_to_mp3(**synthesis_args)

    logger.info(f'Processing done for input file `{input_file}`')
def main():
    """Extract the text of ``sample.pdf`` and restore its missing ligatures.

    A ligature map is built from ``words.txt`` and saved under ``data`` the
    first time (when that directory does not exist yet); afterwards it is
    loaded from ``data`` and queried with the extracted text.
    """
    # Pull the text out of the PDF file.
    raw = textract.process('sample.pdf')
    text = raw.decode('utf-8').strip()

    # Unicode "unknown" glyph that stands in for a missing ligature.
    unknown_lig = u'\ufffd'

    # Only build (and persist) the ligature map when it is not on disk yet.
    map_already_built = os.path.isdir('data')
    if not map_already_built:
        with open('words.txt') as word_file:
            words = set(word_file.read().splitlines())
        built_map = ligatures.build(words)
        built_map.save('data')

    # Always work from the persisted copy of the map.
    lig_map = ligatures.load('data')

    # Restore the missing ligatures; the restored text is not used further
    # in the code visible here.
    _, new_text = lig_map.query_text(text, unknown_lig)
def get_features(**kwargs):
    """Yield a feature mapping for every PDF found in ``kwargs['directory']``.

    For each file returned by ``PDFReader.get_file_list(directory, 'pdf')``
    the meta features are fetched, the text extracted by ``textract`` is
    stored under ``'text'`` and the file path under ``'_texta_id'``.

    Files whose text extraction fails are skipped silently (best effort).

    Raises:
        KeyError: if ``directory`` is not passed as a keyword argument.
    """
    directory = kwargs['directory']

    for file_path in PDFReader.get_file_list(directory, 'pdf'):
        features = PDFReader.get_meta_features(file_path=file_path)
        try:
            features['text'] = textract.process(file_path)
            features['_texta_id'] = file_path
        except Exception:
            # Deliberate best effort: an unreadable file must not stop the
            # whole run. ``Exception`` (not a bare ``except``) lets
            # KeyboardInterrupt/SystemExit through.
            continue

        # The yield lives outside the ``try`` on purpose: when the generator
        # is closed, GeneratorExit is raised here and must not be swallowed,
        # otherwise Python raises "generator ignored GeneratorExit".
        yield features
def convert(filename):
    """Return the text of ``filename`` as a ``str``.

    The default textract extraction is tried first; when it yields nothing,
    extraction is retried with ``method='tesseract'``. The resulting bytes
    are decoded as UTF-8.
    """
    # ``or`` only evaluates the tesseract call when the first result is empty.
    extracted = textract.process(filename, encoding='utf-8') or textract.process(
        filename, encoding='utf-8', method='tesseract'
    )
    return extracted.decode('utf-8')
def _docx_to_txt(file_path):
    """Extract the text of ``file_path`` with textract and decode it as UTF-8."""
    raw_bytes = textract.process(file_path)
    decoded = raw_bytes.decode('utf-8')
    return decoded
def transcribe_document(document):
    """Store the text of the Document's file in its transcription field.

    Extraction is attempted with textract's default method first and with
    ``method='tesseract'`` when that yields no text. Any extraction error is
    logged and leaves the document untouched; the document is only updated
    and saved when some text was obtained.
    """
    try:
        extracted = textract.process(document.file.path).strip()
        if not extracted:
            logger.debug('No text found, falling back to tesseract.')
            extracted = textract.process(document.file.path, method='tesseract').strip()
    except Exception as err:
        extracted = None
        error_message = 'Text extraction error with file {file}: {message}'.format(
            file=document.filename,
            message=str(err),
        )
        logger.error(error_message)

    # Nothing extracted (or extraction failed): leave the document as it is.
    if not extracted:
        return

    document.transcription = extracted.decode()
    document.save(transcribe=False)
def read_resume_content_txtract(self):
    """Extract the text of every dataset resume with textract.

    ``self.resume_content`` is reset and then filled with one decoded string
    per resume that could be read. Entries of ``self.dataset_filenames``
    whose extraction failed are removed in place, so the remaining filenames
    line up with ``self.resume_content``.
    """
    self.logger.println("extracting resume content using textract")
    self.resume_content = []
    # indices of files whose content could not be extracted
    remove_files_idxs = []
    total_files = len(self.dataset_filenames)
    for idx, filename in enumerate(self.dataset_filenames):
        self.logger.println("extracting from resume %s/%s using txtract" % (idx+1, total_files))
        # each entry is indexed as [0] + [1] (name followed by extension) and
        # appended to the raw-data folder path
        filepath = self.__dataset_raw_data_folder + self.__file_path_seperator + filename[0] + filename[1]
        try:
            extracted_bytes = textract.process(filepath, encoding="utf_8")
            extracted_str = extracted_bytes.decode("utf-8")
        except Exception:
            # Best effort: an unreadable resume is recorded and dropped below.
            self.logger.println("txtract threw an error")
            remove_files_idxs.append(idx)
        else:
            self.resume_content.append(extracted_str)

    # Log while the indices still refer to the original positions...
    for idx in remove_files_idxs:
        self.logger.println("removing unprocessed resume file at index %s named %s" % (idx, self.dataset_filenames[idx]))
    # ...then delete from the back so earlier deletions cannot shift the
    # positions of the entries that still have to be removed.
    for idx in reversed(remove_files_idxs):
        del self.dataset_filenames[idx]

    self.logger.println("read content from %s resume files" % len(self.resume_content))
# NOTE(review): fragment of a generator's loop body -- the enclosing function
# and loop headers (which define `document_data` and `url`) are outside this view.
document_id = document_data['id']
# Download the attachment so its text can be extracted locally.
response = requests.get(url)
# Sometimes, Metro Legistar has a URL that returns a bad status code (e.g., 404 from http://metro.legistar1.com/metro/attachments/95d5007e-720b-4cdd-9494-c800392b9265.pdf).
# Skip these documents.
if response.status_code != 200:
logger.error('Document URL {} returns {} - Could not get attachment text!'.format(url, response.status_code))
continue
# Keep the URL's file extension as the temp file suffix.
# NOTE(review): presumably textract uses the extension to pick a parser -- confirm.
extension = os.path.splitext(url)[1]
with tempfile.NamedTemporaryFile(suffix=extension) as tfp:
tfp.write(response.content)
# NOTE(review): the temp file is not flushed before textract reads it by
# name; buffered bytes may not be on disk yet -- confirm whether a
# `tfp.flush()` is needed here.
try:
plain_text = textract.process(tfp.name)
except textract.exceptions.ShellError as e:
# Conversion failed for this document: log it and move on to the next one.
logger.error('{} - Could not convert Document ID {}!'.format(e, document_id))
continue
logger.info('Document ID {} - conversion complete'.format(document_id))
# Emit the extracted text, decoded as UTF-8, together with the document id.
yield {'plain_text': plain_text.decode('utf-8'), 'id': document_id}
def convert(self):
    """Convert the previously downloaded PDF into a string.

    Downloads the PDF file, extracts its text with textract
    (http://textract.readthedocs.io/en/stable/), runs the result through
    BeautifulSoup's 'html.parser' and returns the prettified string.

    Unless ``self.save`` is set, the file at ``self.filename`` is deleted
    afterwards -- also when the conversion fails, so no file is left behind
    on the error path.

    Raises:
        NameError: if textract fails with a ShellError, which is treated as
            "the file is not a PDF".
    """
    import textract

    # Called for its effect only; the file is read back via self.filename.
    # NOTE(review): presumably download() writes to self.filename -- confirm.
    self.download()
    try:
        source_binary = textract.process(self.filename, encoding='utf_8',
                                         method='pdftotext', layout=True)
        soup = BeautifulSoup(source_binary, "html.parser")
        return soup.prettify(formatter=None)
    except textract.exceptions.ShellError:
        # TODO: implement a way to handle non-PDF files.
        print('Not a pdf')
        raise NameError('The file is not a .pdf')
    finally:
        # Delete the downloaded file unless saving was explicitly requested.
        # The existence check keeps a missing file from masking the original
        # error with a FileNotFoundError.
        if not self.save and os.path.exists(self.filename):
            os.remove(self.filename)
@classmethod
def get_path_details(cls, temp_path, image_path):
    """Return the byte sequence and the full text for a given path.

    The file's MIME type is mapped to an extension through ``cls.mime_map``.
    When no extension is found, the text is the empty string. Otherwise the
    text is whatever ``process`` returns for ``temp_path``; any extraction
    failure is logged and yields the placeholder ``"N/A"`` instead.

    Args:
        temp_path: path of the file to read and extract text from.
        image_path: only used in debug log messages on failure.

    Returns:
        A ``(byte_sequence, full_text)`` tuple.
    """
    byte_sequence = ByteSequence.from_path(temp_path)
    extension = map_mime_to_ext(byte_sequence.mime_type, cls.mime_map)
    logging.debug("Assessing MIME: %s EXTENSION %s SHA1:%s", byte_sequence.mime_type,
                  extension, byte_sequence.sha1)

    # No known extension for this MIME type: nothing to extract.
    if extension is None:
        return byte_sequence, ""

    logging.debug("Textract for SHA1 %s, extension map val %s",
                  byte_sequence.sha1, extension)
    try:
        full_text = process(temp_path, extension=extension, encoding='ascii',
                            preserveLineBreaks=True)
        return byte_sequence, full_text
    except ExtensionNotSupported:
        logging.exception("Textract extension not supported for ext %s", extension)
    except LookupError:
        logging.exception("Lookup error for encoding.")
    except UnicodeDecodeError:
        logging.exception("UnicodeDecodeError, problem with file encoding")
    except Exception:
        # Catch-all for extraction failures. ``Exception`` rather than a bare
        # ``except`` so KeyboardInterrupt/SystemExit still propagate.
        logging.exception("Textract UNEXPECTEDLY failed for temp_file.")

    # Shared tail of every failure path: record the paths, return the placeholder.
    logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
    return byte_sequence, "N/A"