How to use the sacremoses.MosesTokenizer class in sacremoses

To help you get started, we've selected a few sacremoses examples based on popular ways it is used in public projects.
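
Before diving into the project snippets below, here is a minimal sketch of the basic round-trip API (assuming sacremoses is installed, e.g. via pip install sacremoses):

from sacremoses import MosesDetokenizer, MosesTokenizer

tokenizer = MosesTokenizer(lang='en')
detokenizer = MosesDetokenizer(lang='en')

# tokenize() returns a list of tokens; pass return_str=True to get a single
# string instead, and escape=False to keep characters such as "&" unescaped.
tokens = tokenizer.tokenize("Hello World, this is sacremoses!", escape=False)

# detokenize() joins the tokens back into a plain sentence.
print(tokens)
print(detokenizer.detokenize(tokens))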

GitHub: Findus23 / se-simulator / markov.py
import markovify
from sacremoses import MosesDetokenizer, MosesTokenizer

tokenizer = MosesTokenizer()
detokenizer = MosesDetokenizer()


class MarkovText(markovify.Text):
    def word_split(self, sentence):
        return tokenizer.tokenize(sentence)

    def word_join(self, words):
        return detokenizer.detokenize(words, return_str=True)


class MarkovUserName(markovify.Text):
    def word_split(self, word):
        return list(word)

    def word_join(self, characters):
        return ''.join(characters)
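
A hypothetical usage sketch for these models (the corpus file name here is invented) might look like:

with open('sentences.txt') as f:  # hypothetical training corpus
    text_model = MarkovText(f.read())

# make_sentence() may return None if markovify cannot build a novel sentence.
print(text_model.make_sentence())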

GitHub: freewym / espresso / fairseq / data / encoders / moses_tokenizer.py
    def __init__(self, args):
        self.args = args

        if getattr(args, 'moses_source_lang', None) is None:
            args.moses_source_lang = getattr(args, 'source_lang', 'en')
        if getattr(args, 'moses_target_lang', None) is None:
            args.moses_target_lang = getattr(args, 'target_lang', 'en')

        try:
            from sacremoses import MosesTokenizer, MosesDetokenizer
            self.tok = MosesTokenizer(args.moses_source_lang)
            self.detok = MosesDetokenizer(args.moses_target_lang)
        except ImportError:
            raise ImportError('Please install Moses tokenizer with: pip install sacremoses')
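
Importing sacremoses inside the constructor keeps it an optional dependency: the package is only required when a Moses tokenizer is actually requested, and a failed import surfaces a concrete install command instead of a bare ImportError.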

GitHub: krasserm / fairseq-image-captioning / preprocess / tokenize_captions.py
def tokenize_captions(captions, lang='en'):
    """Tokenizes captions list with Moses tokenizer.
    """

    tokenizer = MosesTokenizer(lang=lang)
    return [tokenizer.tokenize(caption, return_str=True) for caption in captions]
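
For example (the caption strings are invented for illustration):

captions = ['A man riding a horse.', 'Two dogs play in the snow.']
print(tokenize_captions(captions))
# e.g. ['A man riding a horse .', 'Two dogs play in the snow .']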

GitHub: BLKSerene / Wordless / src / wordless_text / wordless_text_processing.py
            for sentence in sentences:
                tokens_hierarchical.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(toktok_tokenizer.tokenize(sentence))
    # Sacremoses
    elif 'Sacremoses' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(lang = wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_hierarchical.append(moses_tokenizer.tokenize(sentence, escape = False))
        elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(lang = wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_hierarchical.append(moses_tokenizer.penn_tokenize(sentence))
    # spaCy
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if flat_tokens:
            tokens_hierarchical.append([token.text for token in doc])
        else:
            for sentence in doc.sents:
                tokens_hierarchical.append([token.text for token in sentence.as_doc()])

GitHub: huggingface / transformers / src / transformers / tokenization_xlm.py
    def moses_tokenize(self, text, lang):
        if lang not in self.cache_moses_tokenizer:
            moses_tokenizer = sm.MosesTokenizer(lang=lang)
            self.cache_moses_tokenizer[lang] = moses_tokenizer
        else:
            moses_tokenizer = self.cache_moses_tokenizer[lang]
        return moses_tokenizer.tokenize(text, return_str=False, escape=False)
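
Note the per-language cache: constructing a MosesTokenizer compiles a set of regular expressions, so the tokenizer is built once per language and reused on every subsequent call.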

GitHub: dmlc / gluon-nlp / src / gluonnlp / data / tokenizers.py
    def __init__(self, lang: str = 'en', vocab: Optional[Vocab] = None):
        self._lang = lang
        self._vocab = vocab
        if lang == 'zh':
            warnings.warn('You may not use MosesTokenizer for Chinese sentences because it is '
                          'not accurate. Try to use JiebaTokenizer. You may also tokenize the '
                          'chinese sentence to characters and learn a BPE.')
        self._tokenizer = sacremoses.MosesTokenizer(lang=lang)
        self._detokenizer = sacremoses.MosesDetokenizer(lang=lang)

        # Warm up the tokenizer here to compile the regexes.
        # This boosts performance, notably on macOS.
        # For benchmarking results, see
        # https://gist.github.com/sxjscience/f59d2b88262fefd4fb08565c9dec6099
        self._warmup()
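
The _warmup method itself is not shown in this excerpt; a plausible minimal version (an assumption, not gluon-nlp's actual code) runs one throwaway round-trip so the regexes are compiled eagerly:

    def _warmup(self):
        # Hypothetical sketch: a dummy call forces sacremoses to compile
        # its regex patterns before real input arrives.
        self._tokenizer.tokenize('hello world')
        self._detokenizer.detokenize(['hello', 'world'])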

GitHub: eladhoffer / seq2seq.pytorch / seq2seq / tools / tokenizer.py
    def enable_moses(self, lang='en', tokenize=True, detokenize=True):
        if tokenize:
            self._moses_tok = MosesTokenizer(lang=lang)
        else:
            self._moses_tok = None

        if detokenize:
            self._moses_detok = MosesDetokenizer(lang=lang)
        else:
            self._moses_detok = None
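
A hypothetical caller-side helper (not part of the excerpt) shows why the disabled attributes are set to None rather than left undefined:

    def tokenize(self, line):
        # Hypothetical helper: use Moses when enabled, else pass through.
        if self._moses_tok is not None:
            return self._moses_tok.tokenize(line, return_str=True)
        return line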