from functools import partial

import click
from tqdm import tqdm

from sacremoses import (
    MosesDetokenizer,
    MosesDetruecaser,
    MosesPunctNormalizer,
    MosesTokenizer,
)
from sacremoses.chinese import simplify, tradify
from sacremoses.util import parallelize_preprocess


def detokenize_file(language, processes, xml_unescape, encoding, quiet):
    moses = MosesDetokenizer(lang=language)
    moses_detokenize = partial(moses.detokenize, return_str=True, unescape=xml_unescape)
    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # With a single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                for line in tqdm(fin.readlines()):
                    print(moses_detokenize(str.split(line)), end="\n", file=fout)
            else:
                document_iterator = map(str.split, fin.readlines())
                for outline in parallelize_preprocess(
                    moses_detokenize, document_iterator, processes, progress_bar=(not quiet)
                ):
                    print(outline, end="\n", file=fout)
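
A minimal usage sketch of the detokenizer that the wrapper above drives (assumes sacremoses is installed; the sample tokens are illustrative):

    from sacremoses import MosesDetokenizer
    md = MosesDetokenizer(lang="en")
    print(md.detokenize("Hello , world !".split()))
    # -> 'Hello, world!'
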
def normalize_file(language, processes, encoding, normalize_quote_commas, normalize_numbers, quiet):
    # NOTE: the signature and the MosesPunctNormalizer setup were clipped in
    # the original excerpt; this is a minimal reconstruction of that setup.
    moses = MosesPunctNormalizer(
        language,
        norm_quote_commas=normalize_quote_commas,
        norm_numbers=normalize_numbers,
    )
    moses_normalize = partial(moses.normalize)
    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # With a single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                # TODO: moses_normalize(fin.read()) gives the same output and is
                # much faster, but it is inconsistent with the other
                # preprocessing interfaces, so we process line by line here.
                for line in tqdm(fin.readlines()):
                    # Lines keep their trailing newlines, so print with end="".
                    print(moses_normalize(line), end="", file=fout)
            else:
                for outline in parallelize_preprocess(
                    moses_normalize, fin.readlines(), processes, progress_bar=(not quiet)
                ):
                    # Lines keep their trailing newlines, so print with end="".
                    print(outline, end="", file=fout)
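
A minimal usage sketch of the punctuation normalizer driven above (the sample string is illustrative):

    from sacremoses import MosesPunctNormalizer
    mpn = MosesPunctNormalizer(lang="en")
    print(mpn.normalize("«Hello ,  world»"))
    # Typographic quotes and irregular spacing are normalized to plain ASCII forms.
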
def tokenize_file(language, processes, xml_escape, aggressive_dash_splits, protected_patterns, encoding, quiet):
    # NOTE: the signature and the MosesTokenizer setup were clipped in the
    # original excerpt; this is a minimal reconstruction of that setup.
    moses = MosesTokenizer(lang=language)
    moses_tokenize = partial(
        moses.tokenize,
        return_str=True,
        aggressive_dash_splits=aggressive_dash_splits,
        escape=xml_escape,
        protected_patterns=protected_patterns,
    )
    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # With a single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                for line in tqdm(fin.readlines()):
                    print(moses_tokenize(line), end="\n", file=fout)
            else:
                for outline in parallelize_preprocess(
                    moses_tokenize, fin.readlines(), processes, progress_bar=(not quiet)
                ):
                    print(outline, end="\n", file=fout)
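
A minimal usage sketch of the tokenizer configured above (the sample sentence is illustrative):

    from sacremoses import MosesTokenizer
    mt = MosesTokenizer(lang="en")
    print(mt.tokenize("Hello World, this is sacremoses!", return_str=True))
    # -> 'Hello World , this is sacremoses !'
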
def convert_chinese(t2s, processes, encoding, quiet):
    convert = simplify if t2s else tradify
    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # With a single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                # TODO: convert(fin.read()) gives the same output and is much
                # faster, but it is inconsistent with the other preprocessing
                # interfaces, so we process line by line here.
                for line in tqdm(fin.readlines()):
                    # Lines keep their trailing newlines, so print with end="".
                    print(convert(line), end="", file=fout)
            else:
                for outline in parallelize_preprocess(
                    convert, fin.readlines(), processes, progress_bar=(not quiet)
                ):
                    # Lines keep their trailing newlines, so print with end="".
                    print(outline, end="", file=fout)
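
A minimal usage sketch of the conversion helpers selected above; the import path is assumed to mirror this CLI module, and the sample characters are illustrative:

    from sacremoses.chinese import simplify, tradify
    print(simplify("漢字"))  # Traditional -> Simplified, e.g. '汉字'
    print(tradify("汉字"))   # Simplified -> Traditional, e.g. '漢字'
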
def detruecase_file(processes, is_headline, encoding, quiet):
    moses = MosesDetruecaser()
    moses_detruecase = partial(
        moses.detruecase, return_str=True, is_headline=is_headline
    )
    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # With a single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                for line in tqdm(fin.readlines()):
                    print(moses_detruecase(line), end="\n", file=fout)
            else:
                for outline in parallelize_preprocess(
                    moses_detruecase, fin.readlines(), processes, progress_bar=(not quiet)
                ):
                    print(outline, end="\n", file=fout)
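
A minimal usage sketch of the detruecaser wrapped above (sample text is illustrative):

    from sacremoses import MosesDetruecaser
    mdt = MosesDetruecaser()
    print(mdt.detruecase("the adventures of Sherlock Holmes", return_str=True))
    # -> 'The adventures of Sherlock Holmes'
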
from collections import Counter, defaultdict
from itertools import chain


# NOTE: the enclosing class and the method signature were clipped in the
# original excerpt; in sacremoses this logic lives on the truecaser, and the
# signature below is reconstructed from the calls in the body.
def _train(self, document_iterator, save_to=None, possibly_use_first_token=False,
           processes=1, progress_bar=False):
    """Learns truecasing weights from an iterator of tokenized documents.

    :param possibly_use_first_token: Apply the heuristic that the first
        word of a sentence is always capitalized; if this option is provided then:
        a) if a sentence-initial token is *not* capitalized, then it is counted, and
        b) if a capitalized sentence-initial token is the only token of the segment,
           then it is counted, but with only 10% of the weight of a normal token.
    :type possibly_use_first_token: bool
    :returns: A dictionary with the 'best' and 'known' casing statistics,
        as produced by `_casing_to_model()`.
    :rtype: {'best': dict, 'known': Counter}
    """
    casing = defaultdict(Counter)
    train_truecaser = partial(
        self.learn_truecase_weights,
        possibly_use_first_token=possibly_use_first_token,
    )
    token_weights = chain(
        *parallelize_preprocess(
            train_truecaser, document_iterator, processes, progress_bar=progress_bar
        )
    )
    # Collect the token weights from every sentence.
    for lowercase_token, surface_token, weight in token_weights:
        casing[lowercase_token][surface_token] += weight
    # Save to file if specified.
    if save_to:
        self._save_model_from_casing(casing, save_to)
    return self._casing_to_model(casing)
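
A minimal end-to-end sketch of training and applying a truecasing model with the public API (the corpus file name is a placeholder):

    from sacremoses import MosesTokenizer, MosesTruecaser
    mtok = MosesTokenizer(lang="en")
    mtr = MosesTruecaser()
    tokenized_docs = [mtok.tokenize(line) for line in open("big.txt")]
    mtr.train(tokenized_docs, save_to="big.truecasemodel")
    print(mtr.truecase("THE ADVENTURES OF SHERLOCK HOLMES", return_str=True))
    # e.g. 'the adventures of Sherlock Holmes', given the learned model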