How to use the sacremoses.MosesDetokenizer class in sacremoses

To help you get started, we’ve selected a few sacremoses examples based on popular ways it is used in public projects.

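Before the project snippets, here is a minimal round-trip sketch (ours, not from any project below) showing what the class does: MosesDetokenizer reverses Moses-style tokenization, restoring the original punctuation spacing.

from sacremoses import MosesTokenizer, MosesDetokenizer

tokenizer = MosesTokenizer(lang="en")
detokenizer = MosesDetokenizer(lang="en")

tokens = tokenizer.tokenize("Hello, world!")
# tokens == ['Hello', ',', 'world', '!']
print(detokenizer.detokenize(tokens))
# -> "Hello, world!"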

github Findus23 / se-simulator / markov.py
import markovify
from sacremoses import MosesDetokenizer, MosesTokenizer

# Module-level Moses tokenizer/detokenizer; both default to English
tokenizer = MosesTokenizer()
detokenizer = MosesDetokenizer()


class MarkovText(markovify.Text):
    def word_split(self, sentence):
        return tokenizer.tokenize(sentence)

    def word_join(self, words):
        return detokenizer.detokenize(words, return_str=True)


class MarkovUserName(markovify.Text):
    def word_split(self, word):
        return list(word)

    def word_join(self, characters):
        return "".join(characters)
github eladhoffer / seq2seq.pytorch / seq2seq / tools / tokenizer.py
def enable_moses(self, lang='en', tokenize=True, detokenize=True):
    # Build the Moses tokenizer/detokenizer on demand; either side can be
    # disabled, in which case the corresponding attribute is reset to None.
    if tokenize:
        self._moses_tok = MosesTokenizer(lang=lang)
    else:
        self._moses_tok = None

    if detokenize:
        self._moses_detok = MosesDetokenizer(lang=lang)
    else:
        self._moses_detok = None
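The two flags simply gate construction; a standalone sketch of the same idea (the function name is ours, not the project's):

from sacremoses import MosesTokenizer, MosesDetokenizer

def make_moses(lang='en', tokenize=True, detokenize=True):
    """Return (tokenizer, detokenizer); either may be None when disabled."""
    tok = MosesTokenizer(lang=lang) if tokenize else None
    detok = MosesDetokenizer(lang=lang) if detokenize else None
    return tok, detok

tok, detok = make_moses(lang='de', detokenize=False)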
github dmlc / gluon-nlp / src / gluonnlp / data / tokenizers.py
def __init__(self, lang: str = 'en', vocab: Optional[Vocab] = None):
    self._lang = lang
    self._vocab = vocab
    if lang == 'zh':
        warnings.warn('You may not use MosesTokenizer for Chinese sentences because it is '
                      'not accurate. Try to use JiebaTokenizer. You may also tokenize the '
                      'chinese sentence to characters and learn a BPE.')
    self._tokenizer = sacremoses.MosesTokenizer(lang=lang)
    self._detokenizer = sacremoses.MosesDetokenizer(lang=lang)

    # Here, we need to warm-up the tokenizer to compile the regex
    # This will boost the performance in MacOS
    # For benchmarking results, see
    # https://gist.github.com/sxjscience/f59d2b88262fefd4fb08565c9dec6099
    self._warmup()
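_warmup itself is not shown in this snippet; a plausible body, purely our guess based on the comment above, runs one throwaway pass through each object so the lazily compiled regexes are built before real input arrives:

def _warmup(self):
    # Our guess, not gluon-nlp's actual code: a dummy tokenize/detokenize
    # pass forces the regexes to compile eagerly.
    self._tokenizer.tokenize('hello , world !')
    self._detokenizer.detokenize(['hello', ',', 'world', '!'])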
github kedz / nnsum / nnsum2 / seq2seq / simple_postprocessor.py
def init_object(self):
    self._detokenizer = MosesDetokenizer()
    # Regex pattern matching the vocabulary's stop token, with an
    # optional leading space
    self._eos_pat = " ?" + self.vocab.stop_token
    # Regex pattern capturing any single character that follows a space
    self._sent_pat = r" (.)"
github freewym / espresso / examples / roberta / wsc / wsc_utils.py
def get_detokenizer():
    from sacremoses import MosesDetokenizer
    detok = MosesDetokenizer(lang='en')
    return detok
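Usage is then a one-liner; the token list below is illustrative:

detok = get_detokenizer()
print(detok.detokenize(['Hello', ',', 'world', '!']))  # -> "Hello, world!"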
github PyThaiNLP / pythainlp / pythainlp / translate / th2en_bpe2bpe.py
# -*- coding: utf-8 -*-
import os
from functools import partial

from pythainlp.tools import get_full_data_path, get_pythainlp_data_path
from fairseq.models.transformer import TransformerModel
from sacremoses import MosesDetokenizer
from pythainlp.tokenize import word_tokenize as th_word_tokenize

en_word_detokenize = MosesDetokenizer("en")
th_word_tokenize = partial(th_word_tokenize, keep_whitespace=False)


def get_path(model, path1, path2, file=None):
    path = os.path.join(os.path.join(get_full_data_path(model), path1), path2)
    if file is not None:
        return os.path.join(path, file)
    return os.path.join(path, "")


th2en_bpe_model = TransformerModel.from_pretrained(
    model_name_or_path=get_path(
        "scb_1m_th-en_spm",
        "SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0",
        "models",
    ),
github NVIDIA / DeepLearningExamples / PyTorch / Translation / GNMT / seq2seq / data / tokenizer.py
def init_moses(self, lang):
    # lang is a dict mapping 'src'/'tgt' to Moses language codes
    self.moses_tokenizer = sacremoses.MosesTokenizer(lang['src'])
    self.moses_detokenizer = sacremoses.MosesDetokenizer(lang['tgt'])
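A hypothetical call, assuming tokenizer is an instance of the class this method belongs to; the language pair is an example only:

tokenizer.init_moses({'src': 'en', 'tgt': 'de'})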
github PyThaiNLP / pythainlp / pythainlp / translate / th2en_word2word.py
# -*- coding: utf-8 -*-
import os
from pythainlp.tools import get_full_data_path, get_pythainlp_data_path
from fairseq.models.transformer import TransformerModel
from sacremoses import MosesDetokenizer
from pythainlp.tokenize import word_tokenize as th_word_tokenize

en_word_detokenize = MosesDetokenizer("en")


def get_path(model, path1, path2, file=None):
    path = os.path.join(os.path.join(get_full_data_path(model), path1), path2)
    if file is not None:
        return os.path.join(path, file)
    return os.path.join(path, "")


th2en_word2word_model = TransformerModel.from_pretrained(
    model_name_or_path=get_path(
        "scb_1m_th-en_newmm",
        "SCB_1M+TBASE_th-en_newmm-moses_130000-130000_v1.0",
        "models",
    ),
    checkpoint_file="checkpoint.pt",
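The detokenizer's role comes at the end of the pipeline: the model emits space-separated English tokens, and MosesDetokenizer joins them back into plain text. A hedged sketch of that loop, assuming fairseq's hub translate() method and the objects defined above (the Thai input is illustrative):

tokens = th_word_tokenize("สวัสดีครับ")  # segment the Thai input into words
hypothesis = th2en_word2word_model.translate(" ".join(tokens))
english = en_word_detokenize.detokenize(hypothesis.split())
print(english)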