How to use the asrtoolkit.clean_formatting.clean_up function in asrtoolkit

To help you get started, we’ve selected a few asrtoolkit examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github finos / greenkey-asrtoolkit / tests / test_clean_up.py View on Github external
),
        (
            "net booking which grew 6% to $380 million.",
            "net booking which grew six percent to three hundred and eighty million dollars",
        ),
        (
            "to $25 dollars or $0.21 per share price.",
            "to twenty five dollars dollars or zero dollars and twenty one cents per share price",
        ),
        ("year-over-year", "year over year"),
        ("HTC VIVE", "h t c v i v e"),
    ]

    for test in tests:
        input_string = test[0]
        result = clean_up(input_string)
        assert result == test[1]
github finos / greenkey-asrtoolkit / asrtoolkit / data_structures / corpus.py View on Github external
    def count_words(self, clean_func=clean_up):
        """
        Count the whitespace-separated words in this exemplar's transcript
        after passing the transcript text through ``clean_func``.

        :param clean_func: callable applied to the transcript text before counting
        :return int: word count, or 0 when ``self.validate()`` is falsy
        """
        # Validation is checked first; the transcript is only read when it passes.
        if not self.validate():
            return 0

        cleaned_text = clean_func(self.transcript_file.text())
        return len(cleaned_text.split())
github finos / greenkey-asrtoolkit / asrtoolkit / data_handlers / stm.py View on Github external
def format_segment(seg):
    """
    Build the text of a single STM line from a segment.

    :param seg: segment object exposing the attributes filename, channel,
      speaker, start, stop, label, and text
    :return str: the six header attributes (each converted with ``str``)
      followed by the cleaned text, all joined by single spaces
    """
    header_fields = ("filename", "channel", "speaker", "start", "stop", "label")
    pieces = [str(getattr(seg, field)) for field in header_fields]

    # clean_up is applied to the segment text to unformat it for the STM file
    pieces.append(clean_up(seg.text))

    return " ".join(pieces)
github finos / greenkey-asrtoolkit / asrtoolkit / extract_excel_spreadsheets.py View on Github external
def clean_line(line):
    """
    Join the values of a line into one space-separated string and clean it up.

    Values for which ``pd.isnull`` is true are replaced by empty strings;
    every other value is converted with ``str``.
    """
    cells = ("" if pd.isnull(value) else str(value) for value in line)
    joined = " ".join(cells)
    return clean_up(joined)
github finos / greenkey-asrtoolkit / asrtoolkit / wer.py View on Github external
>>> standardize_transcript("this is um a test", remove_nsns=True)
    'this is a test'
    """

    # accept time_aligned_text objects but use their output text
    input_transcript = (input_transcript.text() if isinstance(
        input_transcript, time_aligned_text) else input_transcript)

    # remove tagged noises and other non-speech events
    input_transcript = re.sub(re_tagged_nonspeech, " ", input_transcript)

    if remove_nsns:
        input_transcript = remove_nonsilence_noises(input_transcript)

    # clean punctuation, etc.
    input_transcript = clean_up(input_transcript)

    return input_transcript