Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
from six import text_type
from sacremoses.corpus import Perluniprops
from sacremoses.corpus import NonbreakingPrefixes
from sacremoses.util import is_cjk
perluniprops = Perluniprops()
nonbreaking_prefixes = NonbreakingPrefixes()
class MosesSentTokenizer(object):
"""
Placeholder for a Python port of the Moses sentence splitter from
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/ems/support/split-sentences.perl

Not usable yet: the ``raise`` that follows this docstring is not inside
any method, so ``NotImplementedError`` is raised as soon as this
definition is evaluated, not when an instance is created.
"""
# NOTE(review): presumably a deliberate "not ported yet" marker -- confirm
# before relying on anything that comes after this statement.
raise NotImplementedError
r"""
# Perl Unicode Properties character sets.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
from six import text_type
from sacremoses.corpus import Perluniprops
from sacremoses.corpus import NonbreakingPrefixes
from sacremoses.util import is_cjk
perluniprops = Perluniprops()
nonbreaking_prefixes = NonbreakingPrefixes()
class MosesTokenizer(object):
"""
This is a Python port of the Moses Tokenizer from
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
"""
# Perl Unicode Properties character sets.
IsN = text_type("".join(perluniprops.chars("IsN")))
IsAlnum = text_type("".join(perluniprops.chars("IsAlnum"))) # + u'्'
IsSc = text_type("".join(perluniprops.chars("IsSc")))
IsSo = text_type("".join(perluniprops.chars("IsSo")))
IsAlpha = text_type("".join(perluniprops.chars("IsAlpha")))
IsLower = text_type("".join(perluniprops.chars("IsLower")))