Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
from six import text_type
from sacremoses.corpus import Perluniprops
from sacremoses.corpus import NonbreakingPrefixes
from sacremoses.util import is_cjk
# Module-level instances of the corpus helper classes imported above;
# they are created once, when this module is imported.
perluniprops = Perluniprops()
nonbreaking_prefixes = NonbreakingPrefixes()
class MosesSentTokenizer(object):
    """
    This is a Python port of the Moses sentence splitter from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/ems/support/split-sentences.perl
    """

    # The port is not available yet: this raise executes unconditionally as
    # soon as execution reaches it.
    # NOTE(review): indentation was lost in this copy of the file; the
    # statement is kept inside the class body on the assumption that this
    # matches the upstream layout -- confirm.
    raise NotImplementedError
r"""
# Perl Unicode Properties character sets.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
from six import text_type
from sacremoses.corpus import Perluniprops
from sacremoses.corpus import NonbreakingPrefixes
from sacremoses.util import is_cjk
# Corpus helper instances, created once at import time. `perluniprops`
# supplies the class-level character sets of MosesTokenizer below.
perluniprops = Perluniprops()
nonbreaking_prefixes = NonbreakingPrefixes()
class MosesTokenizer(object):
    """
    Python port of the Moses tokenizer script, see
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
    """

    # Character sets for the Perl Unicode properties used by the tokenizer.
    # Each one is the property's characters joined into a single text string.
    IsN = text_type("").join(perluniprops.chars("IsN"))
    IsAlnum = text_type("").join(perluniprops.chars("IsAlnum"))  # + u'्'
    IsSc = text_type("").join(perluniprops.chars("IsSc"))
    IsSo = text_type("").join(perluniprops.chars("IsSo"))
    IsAlpha = text_type("").join(perluniprops.chars("IsAlpha"))
    IsLower = text_type("").join(perluniprops.chars("IsLower"))
# Python 2.7 compatibility shim: rebind ``open`` to ``io.open`` so that an
# encoding can be used when opening files, and nudge the user towards
# Python 3.
import sys

if sys.version_info < (3,):
    import io
    import warnings

    warnings.warn(
        str(
            "You should really be using Python3!!! "
            "Tick tock, tick tock, https://pythonclock.org/"
        )
    )
    # Deliberately shadows the built-in for the rest of this module.
    open = io.open
# Corpus helper instance, created once at import time; it supplies the
# class-level character sets of MosesTruecaser below.
perluniprops = Perluniprops()
class MosesTruecaser(object):
"""
This is a Python port of the Moses Truecaser from
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/train-truecaser.perl
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/truecase.perl
"""
# Perl Unicode Properties character sets, each joined into one text string.
Lowercase_Letter = text_type("".join(perluniprops.chars("Lowercase_Letter")))
Uppercase_Letter = text_type("".join(perluniprops.chars("Uppercase_Letter")))
# Titlecase letters have their own property; loading "Uppercase_Letter" here
# (as this line previously did) only duplicated the set above.
# NOTE(review): assumes the corpus data provides a "Titlecase_Letter"
# category -- confirm.
Titlecase_Letter = text_type("".join(perluniprops.chars("Titlecase_Letter")))
def __init__(self, load_from=None, is_asr=None, encoding="utf8"):
"""