Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def main(code):
    """Transliterate standard input line by line and write it to stdout.

    Each line read from stdin is lower-cased, NFD-normalized, passed through
    ``Epitran.transliterate`` and written to stdout.

    Args:
        code (str): language-script code handed to ``epitran.Epitran``
    """
    epi = epitran.Epitran(code)
    # On Python 2 ``str`` is ``bytes`` and the standard streams carry encoded
    # byte strings; on Python 3 they already carry text, so decoding a line
    # or writing encoded bytes would raise there.
    byte_streams = str is bytes
    for line in sys.stdin:
        if isinstance(line, bytes):
            line = line.decode('utf-8')
        line = unicodedata.normalize('NFD', line.lower())
        line = epi.transliterate(line)
        if byte_streams:
            line = line.encode('utf-8')
        sys.stdout.write(line)
# dictionary.
#
######################################
setSWords = []
# setSList = open("/home/data/LoReHLT17/internal/Lexicons/orm_lexicon/setS_wordlist.txt", "r")
# Load the word list, one whitespace-stripped entry per line. The ``with``
# block closes the file handle as soon as the list has been read.
with open("../utils/segnerfts/res/setS_wordlist.txt", "r") as setSList:
    for line in setSList:
        setSWords.append(line.strip())
def get_freq_dist():
    """Build a frequency distribution from the Brown corpus and set-S words.

    Returns:
        FreqDist: counts accumulated from ``brown.words()`` followed by the
        module-level ``setSWords`` list.
    """
    counts = FreqDist()
    for tokens in (brown.words(), setSWords):
        counts.update(tokens)
    return counts
# Module-level Epitran instance constructed with the "orm-Latn" code.
epi = epitran.Epitran("orm-Latn")
# Short alias bound to that instance's transliterate method.
g2p = epi.transliterate
def stripFinalVowel(string):
    """Return ``string`` with one trailing vowel (long or short) removed.

    A doubled final vowel ("aa", "ee", "ii", "oo", "uu") is removed as a
    unit; otherwise a single final "a", "e", "i", "o" or "u" is removed.
    Strings that do not end in a vowel, including the empty string, are
    returned unchanged.

    Args:
        string (str): word to strip

    Returns:
        str: the word without its final vowel
    """
    # ``endswith`` is False for the empty string, so empty input falls
    # through to the final return instead of raising IndexError.
    if string.endswith(("aa", "ee", "ii", "oo", "uu")):
        return string[:-2]
    if string.endswith(("a", "e", "i", "o", "u")):
        return string[:-1]
    return string
# Starts building l1_to_l2 (a defaultdict of lists) by opening, as UTF-8
# text, each path in dict_filenames that is an existing regular file.
# NOTE(review): the body is cut off after the `with open(...)` line in this
# excerpt, so how each file is parsed and what is returned cannot be
# documented from here.
def get_dictionary(dict_filenames):
l1_to_l2 = defaultdict(list)
for dict_filename in dict_filenames:
# Paths that are not existing files are skipped.
if os.path.isfile(dict_filename):
with open(dict_filename, "r", encoding="utf-8") as fin:
def __init__(self, code, table, decompose=True, cedict_file=None):
    """Set up a re-romanizer layered on top of Epitran.

    Orthographic input is converted, via Epitran, to a more conventional
    romanization intended to be more readable to most humans.

    Args:
        code (str): ISO 639-3 code and ISO 15924 code joined with a hyphen
        table (str): name of the re-romanization table
        decompose (bool): apply decomposing normalization
        cedict_file: forwarded unchanged to ``epitran.Epitran`` as its
            ``cedict_file`` keyword argument
    """
    transliterator = epitran.Epitran(code, cedict_file=cedict_file)
    self.epi = transliterator
    # The table is loaded only after ``self.epi`` has been assigned.
    romanization_table = self._load_reromanizer(table, decompose)
    self.mapping = romanization_table
def __init__(self, code, space_names):
    """Create a Space that maps segment strings to integers.

    A Space takes strings (corresponding to segments) and returns integers,
    placing them in an integer space that can be translated into a one-hot
    vector. The object offers a dictionary-like interface supporting
    indexing and iteration over "keys".

    Args:
        code (str): ISO 639-3 code joined to ISO 15924 code with "-"
        space_names (list): space names, each an ISO 639-3 code joined to
            an ISO 15924 code with "-"
    """
    transliterator = Epitran(code)
    self.epi = transliterator
    # The space is loaded only after ``self.epi`` has been assigned.
    segment_space = self._load_space(space_names)
    self.dict = segment_space
def main(code, op, infiles, output):
    """Accumulate a segment space from the input files and print it.

    Args:
        code (str): language-script code handed to ``epitran.Epitran``
        op: truthy to scan files with ``add_file_op``, falsy to scan them
            with ``add_file_gen``
        infiles: iterable of input file names to scan
        output: destination passed through to ``print_space``
    """
    epi = epitran.Epitran(code)
    ft = panphon.FeatureTable()
    space = Counter()
    # The scanner depends only on ``op``, so choose it once, not per file.
    add_file = add_file_op if op else add_file_gen
    for fn in infiles:
        # Log text rather than pre-encoded bytes: on Python 3 a bytes message
        # is rendered as its ``b'...'`` repr. Lazy %-formatting also skips the
        # string build when DEBUG logging is disabled.
        logging.debug(u'Scanning:\t%s', fn)
        space.update(add_file(epi, ft, fn))
    print_space(output, space)
def __init__(self, code, space_names):
    """Create a VectorWithIPASpace object.

    Through its word_to_segs method, the object takes orthographic words
    and returns a list of tuples, each consisting of category (letter or
    punctuation), lettercase, orthographic form, phonetic form, id within
    an IPA space, and articulatory feature vector.

    Args:
        code (str): ISO 639-3 code joined to ISO 15924 code with "-"
        space_names (list): space names, each an ISO 639-3 code joined to
            an ISO 15924 code with "-"
    """
    transliterator = Epitran(code)
    self.epi = transliterator
    # The Space is built only after ``self.epi`` has been assigned.
    ipa_space = Space(code, space_names)
    self.space = ipa_space