Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def tokenize(self, text: str) -> str:
"""Tokenizes a text and returns the tokens as a string"""
# REM_NON_PRINT_CHAR
# not implemented
# NORM_PUNC
text = self.normalizer.normalize(text)
# DESCAPE
if self.descape:
text = xml_unescape(text)
# MOSES_TOKENIZER
# see: https://github.com/facebookresearch/LASER/issues/55#issuecomment-480881573
text = self.tokenizer.tokenize(text,
return_str=True,
escape=False,
aggressive_dash_splits=False)
# jieba
if self.lang == 'zh':
text = ' '.join(jieba.cut(text.rstrip('\r\n')))
# MECAB
if self.lang == 'ja':
text = self.mecab_tokenizer.parse(text).rstrip('\r\n')