Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def process_input_file(client, voice, audio_config, input_file: Path, output_directory_path):
    """Turn the text of one input document into an MP3 file.

    The text is extracted from ``input_file`` with textract and handed to
    ``text_to_mp3`` exactly as textract returned it (it is not decoded here).
    The target file is named after the input file's stem with an ``.mp3``
    suffix and is placed under ``output_directory_path``.

    Args:
        client: Forwarded unchanged to ``text_to_mp3``.
        voice: Forwarded unchanged to ``text_to_mp3``.
        audio_config: Forwarded unchanged to ``text_to_mp3``.
        input_file: Path of the document to read.
        output_directory_path: Directory for the MP3; must support the ``/``
            operator with a string (presumably a ``Path`` -- confirm with
            the caller).
    """
    logger.info(f'Processing input file `{input_file}`')

    mp3_name = input_file.stem + '.mp3'
    output_file = output_directory_path / mp3_name
    logger.info(f'Target output file is: `{output_file}`')

    # textract expects a plain string path rather than a Path object.
    extracted_text = textract.process(str(input_file))
    text_to_mp3(
        client=client,
        voice=voice,
        audio_config=audio_config,
        text=extracted_text,
        output_file_path=output_file,
    )

    logger.info(f'Processing done for input file `{input_file}`')
def main():
    """Restore missing ligatures in the text extracted from ``sample.pdf``.

    A ligature map is built from ``words.txt`` and saved to the ``data``
    directory the first time this runs; later runs only load it from there.
    The restored text is computed but not used further in this function.
    """
    # Placeholder that stands in for a missing ligature (U+FFFD, the
    # Unicode replacement character).
    missing_glyph = u'\ufffd'

    # Pull the raw text out of the PDF.
    pdf_text = textract.process('sample.pdf').decode('utf-8').strip()

    # The map is only built once; an existing ``data`` directory is taken
    # to mean it has already been saved.
    if not os.path.isdir('data'):
        with open('words.txt') as words_file:
            vocabulary = set(words_file.read().splitlines())
        built_map = ligatures.build(vocabulary)
        built_map.save('data')

    # Always work with the map as persisted on disk.
    ligature_map = ligatures.load('data')

    # Replace every placeholder glyph with the ligature the map suggests.
    _, new_text = ligature_map.query_text(pdf_text, missing_glyph)
def get_features(**kwargs):
    """Yield one feature dict per PDF file found in a directory.

    Each dict is what ``PDFReader.get_meta_features`` returns for the file,
    extended with two keys: ``'text'`` (the raw output of
    ``textract.process``) and ``'_texta_id'`` (the file path).

    Files whose text cannot be extracted are skipped silently (best-effort).

    Args:
        **kwargs: Must contain ``directory``, the directory to scan.

    Yields:
        The feature dict of every file that could be processed.

    Raises:
        KeyError: If ``directory`` is missing (raised on first iteration).
    """
    directory = kwargs['directory']
    for file_path in PDFReader.get_file_list(directory, 'pdf'):
        features = PDFReader.get_meta_features(file_path=file_path)
        try:
            features['text'] = textract.process(file_path)
            features['_texta_id'] = file_path
        except Exception:
            # Best-effort: an unreadable file must not abort the whole scan.
            # Only ``Exception`` is caught so that KeyboardInterrupt and
            # SystemExit still propagate.
            continue
        else:
            # The yield is deliberately outside the ``try``: a consumer that
            # closes the generator early raises GeneratorExit here, and
            # swallowing it would make the generator yield again and fail
            # with "generator ignored GeneratorExit".
            yield features
def convert(filename):
    """Return the text of ``filename`` as a ``str``.

    textract's default extraction is tried first; when it produces nothing,
    the extraction is repeated with ``method='tesseract'``.
    """
    raw = textract.process(filename, encoding='utf-8')
    if raw:
        return raw.decode('utf-8')

    # Nothing came back from the default method, so retry with tesseract.
    fallback = textract.process(filename, encoding='utf-8', method='tesseract')
    return fallback.decode('utf-8')
def _docx_to_txt(file_path):
    """Extract the text of the file at ``file_path`` and return it as ``str``."""
    raw_bytes = textract.process(file_path)
    return raw_bytes.decode('utf-8')
def transcribe_document(document):
    """Store the Document file's text in the transcription field.

    The text is extracted with textract's default method; if that produces
    nothing, extraction is retried with ``method='tesseract'``. Any error
    during extraction is logged and leaves the document untouched. The
    document is only saved when some text was found.
    """
    try:
        text = textract.process(document.file.path).strip()
        if not text:
            logger.debug('No text found, falling back to tesseract.')
            text = textract.process(document.file.path, method='tesseract').strip()
    except Exception as err:
        # Extraction is best-effort: report the failure and carry on
        # without a transcription.
        text = None
        message = 'Text extraction error with file {file}: {message}'.format(
            file=document.filename,
            message=str(err),
        )
        logger.error(message)

    if not text:
        return

    document.transcription = text.decode()
    # Saved with transcribe=False, presumably so the save does not trigger
    # another transcription -- confirm against Document.save.
    document.save(transcribe=False)
def read_resume_content_txtract(self):
    """Extract the text of every dataset file into ``self.resume_content``.

    Each entry of ``self.dataset_filenames`` is read with textract and the
    decoded text is appended to ``self.resume_content``. Entries whose
    extraction fails are removed from ``self.dataset_filenames`` in place,
    so both lists end up with one element per successfully read file, in
    the same order.
    """
    self.logger.println("extracting resume content using textract")
    self.resume_content = []
    # idxs of files that don't have content
    remove_files_idxs = []
    total_files = len(self.dataset_filenames)
    for idx, filename in enumerate(self.dataset_filenames):
        self.logger.println("extracting from resume %s/%s using txtract" % (idx+1, total_files))
        # append filename + ext to path (filename[0] is the name, filename[1] the extension)
        filepath = self.__dataset_raw_data_folder + self.__file_path_seperator + filename[0] + filename[1]
        try:
            extracted_bytes = textract.process(filepath, encoding="utf_8")
            self.resume_content.append(extracted_bytes.decode("utf-8"))
        except Exception:
            # Best-effort: one unreadable file must not stop the whole run,
            # but KeyboardInterrupt/SystemExit still propagate.
            self.logger.println("txtract threw an error")
            remove_files_idxs.append(idx)
    # Delete from the highest index down: removing an entry shifts only the
    # entries after it, so the remaining (smaller) indices stay valid and
    # the logged name is always the one actually removed.
    for idx in reversed(remove_files_idxs):
        self.logger.println("removing unprocessed resume file at index %s named %s" % (idx, self.dataset_filenames[idx]))
        del self.dataset_filenames[idx]
    self.logger.println("read content from %s resume files" % len(self.resume_content))