Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Set up a Moses tokenizer callable and open the stdin/stdout text streams.
# NOTE(review): this excerpt has lost its indentation and stops at the
# innermost `with`; the code that reads `fin` / writes `fout` is not visible
# here, and neither is any use of `processes` or `quiet` -- confirm against
# the full source before relying on this description.
def tokenize_file(
language,
processes,
xml_escape,
aggressive_dash_splits,
protected_patterns,
custom_nb_prefixes,
encoding,
quiet
):
# Build the tokenizer for `language`; `custom_nb_prefixes` is forwarded as the
# custom nonbreaking-prefixes file.
moses = MosesTokenizer(lang=language,
custom_nonbreaking_prefixes_file=custom_nb_prefixes)
# `protected_patterns` arrives as a file path (it is passed to open()). When
# truthy, the name is rebound to the list of stripped lines read from that
# file, decoded as UTF-8.
if protected_patterns:
with open(protected_patterns, encoding="utf8") as fin:
protected_patterns = [pattern.strip() for pattern in fin.readlines()]
# Pre-bind the options that stay the same for every call to moses.tokenize.
moses_tokenize = partial(
moses.tokenize,
return_str=True,
aggressive_dash_splits=aggressive_dash_splits,
escape=xml_escape,
protected_patterns=protected_patterns,
)
# Open click's stdin and stdout text streams using `encoding`.
# Note that `fin` is reused here after serving as the patterns file handle.
with click.get_text_stream("stdin", encoding=encoding) as fin:
with click.get_text_stream("stdout", encoding=encoding) as fout:
# Set up a Moses tokenizer callable and open the stdin/stdout text streams.
# NOTE(review): this excerpt has lost its indentation and stops at the
# innermost `with`; the code that reads `fin` / writes `fout` is not visible
# here, and neither is any use of `processes` or `quiet` -- confirm against
# the full source before relying on this description.
def tokenize_file(
language,
processes,
xml_escape,
aggressive_dash_splits,
protected_patterns,
custom_nb_prefixes,
encoding,
quiet
):
# Build the tokenizer for `language`; `custom_nb_prefixes` is forwarded as the
# custom nonbreaking-prefixes file.
moses = MosesTokenizer(lang=language,
custom_nonbreaking_prefixes_file=custom_nb_prefixes)
# `protected_patterns` arrives as a file path (it is passed to open()). When
# truthy, the name is rebound to the list of stripped lines read from that
# file, decoded as UTF-8.
if protected_patterns:
with open(protected_patterns, encoding="utf8") as fin:
protected_patterns = [pattern.strip() for pattern in fin.readlines()]
# Pre-bind the options that stay the same for every call to moses.tokenize.
moses_tokenize = partial(
moses.tokenize,
return_str=True,
aggressive_dash_splits=aggressive_dash_splits,
escape=xml_escape,
protected_patterns=protected_patterns,
)
# Open click's stdin and stdout text streams using `encoding`.
# Note that `fin` is reused here after serving as the patterns file handle.
with click.get_text_stream("stdin", encoding=encoding) as fin:
with click.get_text_stream("stdout", encoding=encoding) as fout:
def __init__(self, lang="en", custom_nonbreaking_prefixes_file=None):
    """Set the language and load the nonbreaking prefixes.

    :param lang: language code handed to ``nonbreaking_prefixes.words`` to
        load the built-in prefixes.
    :param custom_nonbreaking_prefixes_file: optional path to a text file
        with one prefix per line. When truthy, its entries replace the
        built-in prefixes entirely. Blank lines and lines starting with
        ``#`` are skipped; duplicates are dropped, keeping first-seen order.
    :raises OSError: if the custom prefixes file cannot be opened.
    :raises UnicodeDecodeError: if the custom prefixes file is not UTF-8.
    """
    # Initialize the object.
    super(MosesTokenizer, self).__init__()
    self.lang = lang

    # Initialize the language specific nonbreaking prefixes.
    self.NONBREAKING_PREFIXES = [
        _nbp.strip() for _nbp in nonbreaking_prefixes.words(lang)
    ]

    # Load custom nonbreaking prefixes file.
    if custom_nonbreaking_prefixes_file:
        custom_prefixes = []
        # The set makes the duplicate check O(1); the list keeps the order.
        seen = set()
        # Decode explicitly as UTF-8 instead of the platform locale encoding,
        # so non-ASCII prefixes load identically on every system.
        with open(
            custom_nonbreaking_prefixes_file, "r", encoding="utf8"
        ) as fin:
            for line in fin:
                line = line.strip()
                # Skip blank lines, comment lines, and repeated entries.
                if not line or line.startswith("#") or line in seen:
                    continue
                seen.add(line)
                custom_prefixes.append(line)
        self.NONBREAKING_PREFIXES = custom_prefixes