import markovify
from sacremoses import MosesDetokenizer, MosesTokenizer

tokenizer = MosesTokenizer()
detokenizer = MosesDetokenizer()


class MarkovText(markovify.Text):
    def word_split(self, sentence):
        return tokenizer.tokenize(sentence)

    def word_join(self, words):
        return detokenizer.detokenize(words, return_str=True)


class MarkovUserName(markovify.Text):
    def word_split(self, word):
        return list(word)

    def word_join(self, characters):
        # The body was missing in the original snippet; rejoining the
        # characters is the natural inverse of word_split above.
        return ''.join(characters)
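# Minimal usage sketch (the corpus string here is a stand-in for real training text):
corpus = "One sentence here. Another sentence here. Yet another sentence here."
text_model = MarkovText(corpus)
print(text_model.make_sentence(tries=100))  # may be None on such a tiny corpus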
# Moses tokenizer wrapper driven by an argparse-style namespace (fairseq-style);
# the enclosing class line is restored for context with an assumed name.
class MosesTokenizerWrapper:
    def __init__(self, args):
        self.args = args
        if getattr(args, 'moses_source_lang', None) is None:
            args.moses_source_lang = getattr(args, 'source_lang', 'en')
        if getattr(args, 'moses_target_lang', None) is None:
            args.moses_target_lang = getattr(args, 'target_lang', 'en')
        try:
            from sacremoses import MosesTokenizer, MosesDetokenizer
            self.tok = MosesTokenizer(args.moses_source_lang)
            self.detok = MosesDetokenizer(args.moses_target_lang)
        except ImportError:
            raise ImportError('Please install Moses tokenizer with: pip install sacremoses')
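# Hypothetical usage with a SimpleNamespace standing in for parsed args:
from types import SimpleNamespace

args = SimpleNamespace(source_lang='de', target_lang='en')
wrapper = MosesTokenizerWrapper(args)
print(wrapper.tok.tokenize('Guten Tag!'))  # e.g. ['Guten', 'Tag', '!']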
def tokenize_captions(captions, lang='en'):
    """Tokenize a list of captions with the Moses tokenizer."""
    tokenizer = MosesTokenizer(lang=lang)
    return [tokenizer.tokenize(caption, return_str=True) for caption in captions]
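# Example call (hypothetical captions); return_str=True yields one
# space-separated string per caption:
captions = ["A dog runs on the beach.", "Two people, smiling."]
print(tokenize_captions(captions))
# e.g. ['A dog runs on the beach .', 'Two people , smiling .']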
# Excerpt from a Wordless-style word-tokenization dispatcher; `main`,
# `wordless_sentence_tokenize`, and `wordless_conversion` belong to the host
# application, and the branch above the first `for` loop is truncated.
        for sentence in sentences:
            tokens_hierarchical.append(nist_tokenizer.tokenize(sentence))
    elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
        toktok_tokenizer = nltk.ToktokTokenizer()

        for sentence in sentences:
            tokens_hierarchical.append(toktok_tokenizer.tokenize(sentence))
    # Sacremoses
    elif 'Sacremoses' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_hierarchical.append(moses_tokenizer.tokenize(sentence, escape=False))
        elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_hierarchical.append(moses_tokenizer.penn_tokenize(sentence))
    # spaCy
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if flat_tokens:
            tokens_hierarchical.append([token.text for token in doc])
        else:
            for sentence in doc.sents:
                tokens_hierarchical.append([token.text for token in sentence.as_doc()])
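# Standalone illustration of the two sacremoses calls used in the branch above:
import sacremoses

mt = sacremoses.MosesTokenizer(lang='en')
print(mt.tokenize('A "quoted" word.', escape=False))  # leaves the quotes as-is
print(mt.tokenize('A "quoted" word.'))                # default escape=True turns " into &quot;
print(mt.penn_tokenize('A "quoted" word.'))           # Penn Treebank-style tokenization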
# Per-language tokenizer caching (excerpt from a host class); assumes
# `import sacremoses as sm` and a `self.cache_moses_tokenizer` dict
# initialized elsewhere.
def moses_tokenize(self, text, lang):
    if lang not in self.cache_moses_tokenizer:
        moses_tokenizer = sm.MosesTokenizer(lang=lang)
        self.cache_moses_tokenizer[lang] = moses_tokenizer
    else:
        moses_tokenizer = self.cache_moses_tokenizer[lang]
    return moses_tokenizer.tokenize(text, return_str=False, escape=False)
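# Minimal self-contained use of the caching pattern above (hypothetical host class):
import sacremoses as sm

class CachedTokenizerDemo:
    def __init__(self):
        self.cache_moses_tokenizer = {}
    moses_tokenize = moses_tokenize  # bind the function above as a method

demo = CachedTokenizerDemo()
print(demo.moses_tokenize('Hello, world!', lang='en'))  # e.g. ['Hello', ',', 'world', '!']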
# Sacremoses-backed tokenizer (GluonNLP-style); the imports and the enclosing
# class line are restored for context, and `Vocab` comes from the host library.
import warnings
from typing import Optional

import sacremoses

class SacreMosesTokenizer:
    def __init__(self, lang: str = 'en', vocab: Optional['Vocab'] = None):
        self._lang = lang
        self._vocab = vocab
        if lang == 'zh':
            warnings.warn('You may not use MosesTokenizer for Chinese sentences because it is '
                          'not accurate. Try to use JiebaTokenizer. You may also tokenize the '
                          'Chinese sentence to characters and learn a BPE.')
        self._tokenizer = sacremoses.MosesTokenizer(lang=lang)
        self._detokenizer = sacremoses.MosesDetokenizer(lang=lang)
        # Warm up the tokenizer so its regexes are compiled once up front;
        # this noticeably boosts performance on macOS. For benchmarking results, see
        # https://gist.github.com/sxjscience/f59d2b88262fefd4fb08565c9dec6099
        self._warmup()
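    # _warmup() is not shown in the excerpt; a plausible minimal version simply
    # runs the tokenizer and detokenizer once on a short string to force the
    # regex compilation mentioned above (assumed implementation):
    def _warmup(self):
        self._tokenizer.tokenize('hello , world !')
        self._detokenizer.detokenize(['hello', ',', 'world', '!'])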
# Optional Moses tokenization/detokenization toggles (excerpt from a host class):
def enable_moses(self, lang='en', tokenize=True, detokenize=True):
    if tokenize:
        self._moses_tok = MosesTokenizer(lang=lang)
    else:
        self._moses_tok = None

    if detokenize:
        self._moses_detok = MosesDetokenizer(lang=lang)
    else:
        self._moses_detok = None
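# Round-trip sketch using sacremoses directly, mirroring what enable_moses wires up:
from sacremoses import MosesDetokenizer, MosesTokenizer

tok = MosesTokenizer(lang='en')
detok = MosesDetokenizer(lang='en')

tokens = tok.tokenize("Hello, world! It's a test.", escape=False)
print(tokens)                    # e.g. ['Hello', ',', 'world', '!', 'It', "'s", 'a', 'test', '.']
print(detok.detokenize(tokens))  # e.g. "Hello, world! It's a test."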