import markovify
from sacremoses import MosesDetokenizer, MosesTokenizer

tokenizer = MosesTokenizer()
detokenizer = MosesDetokenizer()


class MarkovText(markovify.Text):
    # Use Moses tokenization instead of markovify's default whitespace split.
    def word_split(self, sentence):
        return tokenizer.tokenize(sentence)

    def word_join(self, words):
        return detokenizer.detokenize(words, return_str=True)


class MarkovUserName(markovify.Text):
    # Character-level model: each "word" is a single character.
    def word_split(self, word):
        return list(word)

    def word_join(self, characters):
        return "".join(characters)
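A minimal usage sketch for the text model above (the corpus file name and state size are our own example values, not from the snippet):

with open("corpus.txt", encoding="utf-8") as f:  # hypothetical corpus file
    text_model = MarkovText(f.read(), state_size=2)
print(text_model.make_sentence())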
from sacremoses import MosesDetokenizer, MosesTokenizer


def enable_moses(self, lang='en', tokenize=True, detokenize=True):
    # Attach Moses tokenizer/detokenizer instances to the host object on demand.
    if tokenize:
        self._moses_tok = MosesTokenizer(lang=lang)
    else:
        self._moses_tok = None
    if detokenize:
        self._moses_detok = MosesDetokenizer(lang=lang)
    else:
        self._moses_detok = None
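For reference, a quick round trip through the two objects this method creates, using the standard sacremoses API (the sample sentence is ours):

tok = MosesTokenizer(lang='en')
detok = MosesDetokenizer(lang='en')
tokens = tok.tokenize("Hello, world!")   # ['Hello', ',', 'world', '!']
print(detok.detokenize(tokens))          # "Hello, world!"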
import warnings
from typing import Optional

import sacremoses


def __init__(self, lang: str = 'en', vocab: Optional[Vocab] = None):
    # `Vocab` is defined by the host library this method belongs to.
    self._lang = lang
    self._vocab = vocab
    if lang == 'zh':
        warnings.warn('MosesTokenizer is not accurate for Chinese sentences. '
                      'Try JiebaTokenizer instead, or tokenize the Chinese '
                      'sentence into characters and learn a BPE.')
    self._tokenizer = sacremoses.MosesTokenizer(lang=lang)
    self._detokenizer = sacremoses.MosesDetokenizer(lang=lang)
    # Warm up the tokenizer so its regexes are compiled once, up front.
    # This noticeably boosts performance on macOS; for benchmarking results, see
    # https://gist.github.com/sxjscience/f59d2b88262fefd4fb08565c9dec6099
    self._warmup()
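The snippet doesn't show _warmup itself; a minimal sketch (hypothetical, the real method may do more) is a single throwaway call that forces the regex compilation:

def _warmup(self):
    # One tokenize/detokenize pass compiles the regexes ahead of first real use.
    self._detokenizer.detokenize(self._tokenizer.tokenize('hello, world!'))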
def init_object(self):
    self._detokenizer = MosesDetokenizer()
    # Regex patterns for post-processing generated text: an optional leading
    # space plus the vocabulary's stop token, and a single-character capture.
    self._eos_pat = " ?" + self.vocab.stop_token
    self._sent_pat = r" (.)"
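The snippet doesn't show how these patterns are applied; one plausible use, with a hypothetical "<eos>" stop token, is stripping the token from generated text:

import re

raw = "hello world <eos>"             # hypothetical model output
eos_pat = " ?" + re.escape("<eos>")   # re.escape guards regex metacharacters
print(re.sub(eos_pat, "", raw))       # -> "hello world"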
def get_detokenizer():
    from sacremoses import MosesDetokenizer

    detok = MosesDetokenizer(lang='en')
    return detok
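Called like so (the token list is our own example):

detok = get_detokenizer()
print(detok.detokenize(["Hello", ",", "world", "!"]))  # -> "Hello, world!"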
# -*- coding: utf-8 -*-
import os
from functools import partial

from pythainlp.tools import get_full_data_path, get_pythainlp_data_path
from fairseq.models.transformer import TransformerModel
from sacremoses import MosesDetokenizer
from pythainlp.tokenize import word_tokenize as th_word_tokenize

en_word_detokenize = MosesDetokenizer("en")
# Tokenize Thai text, dropping whitespace tokens.
th_word_tokenize = partial(th_word_tokenize, keep_whitespace=False)


def get_path(model, path1, path2, file=None):
    path = os.path.join(get_full_data_path(model), path1, path2)
    if file is not None:
        return os.path.join(path, file)
    return os.path.join(path, "")


th2en_bpe_model = TransformerModel.from_pretrained(
    model_name_or_path=get_path(
        "scb_1m_th-en_spm",
        "SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0",
        "models",
    ),
    checkpoint_file="checkpoint.pt",
)
import sacremoses


def init_moses(self, lang):
    # `lang` maps direction to language code, e.g. {'src': 'de', 'tgt': 'en'}.
    self.moses_tokenizer = sacremoses.MosesTokenizer(lang['src'])
    self.moses_detokenizer = sacremoses.MosesDetokenizer(lang['tgt'])
# -*- coding: utf-8 -*-
import os

from pythainlp.tools import get_full_data_path, get_pythainlp_data_path
from fairseq.models.transformer import TransformerModel
from sacremoses import MosesDetokenizer
from pythainlp.tokenize import word_tokenize as th_word_tokenize

en_word_detokenize = MosesDetokenizer("en")


def get_path(model, path1, path2, file=None):
    path = os.path.join(get_full_data_path(model), path1, path2)
    if file is not None:
        return os.path.join(path, file)
    return os.path.join(path, "")


th2en_word2word_model = TransformerModel.from_pretrained(
    model_name_or_path=get_path(
        "scb_1m_th-en_newmm",
        "SCB_1M+TBASE_th-en_newmm-moses_130000-130000_v1.0",
        "models",
    ),
    checkpoint_file="checkpoint.pt",
)
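A hedged sketch of how these pieces combine for Thai-to-English translation: tokenize the Thai input, feed the space-joined tokens through fairseq's hub interface, and detokenize the English output. The input sentence and variable names are our own; the real pythainlp wrapper may differ.

thai_text = "ฉันรักคุณ"  # hypothetical input: "I love you"
tokenized = " ".join(th_word_tokenize(thai_text))
hypothesis = th2en_word2word_model.translate(tokenized)
print(en_word_detokenize.detokenize(hypothesis.split(" ")))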