How to use the textract.process function in textract

To help you get started, we’ve selected a few textract examples that show popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github danthelion / doc2audiobook / doc2audiobook.py View on Github external
def process_input_file(client, voice, audio_config, input_file: Path, output_directory_path):
    """Extract the text of ``input_file`` and pass it on to ``text_to_mp3``.

    The output file lives in ``output_directory_path`` and is named after the
    input file's stem with an ``.mp3`` suffix. ``client``, ``voice`` and
    ``audio_config`` are forwarded to ``text_to_mp3`` unchanged.
    """
    logger.info(f'Processing input file `{input_file}`')

    mp3_name = input_file.stem + '.mp3'
    output_file = output_directory_path / mp3_name
    logger.info(f'Target output file is: `{output_file}`')

    # textract is given the path as a plain string rather than a Path object.
    extracted = textract.process(str(input_file))
    text_to_mp3(
        client=client,
        voice=voice,
        audio_config=audio_config,
        text=extracted,
        output_file_path=output_file,
    )

    logger.info(f'Processing done for input file `{input_file}`')
github adamheins / ligatures / examples / parse_pdf.py View on Github external
def main():
    """Extract the text of ``sample.pdf`` and restore its missing ligatures."""
    # Pull the raw bytes out of the PDF, then decode and trim them.
    raw = textract.process('sample.pdf')
    text = raw.decode('utf-8').strip()

    # Unicode replacement character, standing in for a missing ligature.
    unknown_lig = '\ufffd'

    # When there is no 'data' directory yet, build the ligature map from the
    # word list and save it there.
    data_dir_exists = os.path.isdir('data')
    if not data_dir_exists:
        with open('words.txt') as f:
            words = set(f.read().splitlines())
        lig_map = ligatures.build(words)
        lig_map.save('data')

    # Always work with the map as loaded from the data directory.
    lig_map = ligatures.load('data')

    # Query the map with the extracted text and the placeholder glyph; the
    # second element of the result is the restored text.
    _, new_text = lig_map.query_text(text, unknown_lig)
github texta-tk / texta / dataset_importer / document_reader / readers / entity / pdf_reader.py View on Github external
def get_features(**kwargs):
    """Yield one feature dict per PDF file found in a directory.

    Keyword Args:
        directory: Location passed to ``PDFReader.get_file_list`` to
            enumerate the ``pdf`` files.

    Yields:
        The dict returned by ``PDFReader.get_meta_features`` for the file,
        extended with ``'text'`` (the output of ``textract.process``) and
        ``'_texta_id'`` (the file path).

    Raises:
        KeyError: If ``directory`` is not supplied.
    """
    directory = kwargs['directory']

    for file_path in PDFReader.get_file_list(directory, 'pdf'):
        features = PDFReader.get_meta_features(file_path=file_path)

        try:
            features['text'] = textract.process(file_path)
        except Exception:
            # Best effort: a file whose text cannot be extracted is skipped.
            continue

        features['_texta_id'] = file_path

        # The yield is deliberately outside the try block. A bare ``except``
        # wrapped around it would catch GeneratorExit (raised at the yield
        # when the consumer closes the generator) as well as any exception
        # the consumer throws in, and the loop would then keep yielding.
        yield features
github wellcometrust / reach / tools / extraction.py View on Github external
def convert(filename):
    """Return the text extracted from ``filename`` as a ``str``.

    textract's default extraction is tried first; when it produces an empty
    result, extraction is retried with ``method='tesseract'``. The resulting
    bytes are decoded as UTF-8.
    """
    raw = textract.process(filename, encoding='utf-8') or textract.process(
        filename, encoding='utf-8', method='tesseract'
    )
    return raw.decode('utf-8')
github mitmedialab / DataBasic / databasic / logic / filehandler.py View on Github external
def _docx_to_txt(file_path):
    """Return the text textract extracts from ``file_path``, decoded as UTF-8."""
    raw_bytes = textract.process(file_path)
    return raw_bytes.decode('utf-8')
github fourdigits / wagtail_textract / src / wagtail_textract / handlers.py View on Github external
def transcribe_document(document):
    """Extract the Document file's text and store it in the transcription field.

    The default textract extraction runs first; if it yields nothing, the
    extraction is retried with ``method='tesseract'``. Any exception raised
    while extracting is logged and the document is left untouched. The
    document is only saved when some text was found.
    """
    try:
        text = textract.process(document.file.path).strip()
        if not text:
            logger.debug('No text found, falling back to tesseract.')
            text = textract.process(document.file.path, method='tesseract').strip()
    except Exception as err:
        error_message = 'Text extraction error with file {file}: {message}'.format(
            file=document.filename,
            message=str(err),
        )
        logger.error(error_message)
        return

    # Nothing extracted by either method: leave the document as it is.
    if not text:
        return

    document.transcription = text.decode()
    document.save(transcribe=False)
github adamkhazi / information-extraction-system / extractor.py View on Github external
def read_resume_content_txtract(self):
    """Extract the text of every dataset file with textract.

    ``self.resume_content`` is reset and then filled with the decoded text of
    each file in ``self.dataset_filenames`` that textract could process.
    Entries whose extraction raised are removed from
    ``self.dataset_filenames`` in place, so both lists end up with the same
    length and order.

    Each entry of ``self.dataset_filenames`` is indexed as
    ``(name, extension)`` when building the file path.
    """
    self.logger.println("extracting resume content using textract")
    self.resume_content = []
    # idxs of files that don't have content
    remove_files_idxs = []
    total = len(self.dataset_filenames)
    for idx, filename in enumerate(self.dataset_filenames):
        self.logger.println("extracting from resume %s/%s using txtract" % (idx+1, total))
        # append filename + ext to path
        filepath = self.__dataset_raw_data_folder + self.__file_path_seperator + filename[0] + filename[1]
        try:
            extracted_bytes = textract.process(filepath, encoding="utf_8")
            extracted_str = extracted_bytes.decode("utf-8")
        except Exception:
            self.logger.println("txtract threw an error")
            remove_files_idxs.append(idx)
        else:
            self.resume_content.append(extracted_str)

    # remove_files_idxs holds indices into the ORIGINAL list in ascending
    # order. Every deletion shifts the remaining entries one slot to the
    # left, so each index is offset by the number of entries already removed.
    for deleted_count, idx in enumerate(remove_files_idxs):
        current_idx = idx - deleted_count
        self.logger.println("removing unprocessed resume file at index %s named %s" % (idx, self.dataset_filenames[current_idx]))
        del self.dataset_filenames[current_idx]

    self.logger.println("read content from %s resume files" % len(self.resume_content))

textract

extract text from any document. no muss. no fuss.

MIT
Latest version published 3 years ago

Package Health Score

58 / 100
Full package analysis