Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
prev = next_letter #update always
return
def save(self, filename):
    """Dump the non-zero bigram counts to a UTF-8 text file.

    Every (first, second) pair in ``self.letter2`` whose count is not
    zero is written as one ``"<first><second> - <count>"`` line, with the
    highest counts first.

    :param filename: path of the file to create or overwrite.
    :return: always ``True``.
    """
    with codecs.open(filename, "w", "utf-8") as out:
        # Flatten the nested table; if two pairs concatenate to the same
        # string, the later one overwrites the earlier one.
        flat = {
            first + second: count
            for first, row in self.letter2.items()
            for second, count in row.items()
            if count != 0
        }
        ranked = list(flat.items())
        ranked.sort(key=operator.itemgetter(1), reverse=True)
        for pair, count in ranked:
            out.write(u"%s - %d\n" % (pair, count))
    return True
class Trigram(Unigram):
    """Letter-trigram frequency counter layered on top of ``Unigram``."""

    def __init__(self, filename):
        """Initialise the ``Unigram`` base with *filename* and start with
        an empty trigram table."""
        Unigram.__init__(self, filename)
        # Maps a three-letter string to the number of times it was seen.
        self.letter3 = {}

    def language_model(self, verbose=True):
        """Build the trigram letter model from the corpus.

        Walks the letters yielded by ``self.corpus.next_tamil_letter()``
        and, for each run of three consecutive letters, increments that
        trigram's count in ``self.letter3``.

        :param verbose: accepted for signature parity with the other
            models; it is not used here.
        """
        # NOTE(review): self.corpus is presumably created by
        # Unigram.__init__, which is not visible here -- confirm.
        older = newer = None
        for current in self.corpus.next_tamil_letter():
            # Count only once two earlier letters have been seen.
            if older:
                key = older + newer + current
                self.letter3[key] = self.letter3.get(key, 0) + 1
            # Slide the two-letter history window forward on every letter.
            older, newer = newer, current
def __init__(self, filename=u'tamilvu_dictionary_words.txt'):
    """Build unigram, bigram and trigram letter models from one corpus file.

    :param filename: path of the corpus file handed to each model.
        Defaults to the previously hard-coded dictionary word list, so
        existing no-argument callers behave exactly as before.

    The constructor builds all three models immediately and prints a
    progress line after the unigram model and after the bigram/trigram
    models.
    """
    self.filename = filename
    self.unigram = Unigram(self.filename)
    self.unigram.frequency_model()
    print(u"--- completed Unigram model ---")
    # The two higher-order models are built with verbose=False.
    self.bigram = Bigram(self.filename)
    self.bigram.language_model(verbose=False)
    self.trigram = Trigram(self.filename)
    self.trigram.language_model(verbose=False)
    print(u"--- completed Bigram,Trigram model ---")
def frequency_model(self):
    """Count how often each letter occurs in the corpus.

    Increments ``self.letter[<letter>]`` once for every letter yielded by
    ``self.corpus.next_tamil_letter()``.  Each yielded letter must already
    be a key of ``self.letter`` unless that mapping supplies defaults.
    """
    # The corpus is consumed lazily, one letter at a time.
    for glyph in self.corpus.next_tamil_letter():
        self.letter[glyph] += 1
def save(self, filename):
    """Write the non-zero letter counts to a UTF-8 text file.

    Entries of ``self.letter`` are written as ``"<letter> - <count>"``
    lines, highest count first; letters whose count is zero are left out.

    :param filename: path of the file to create or overwrite.
    :return: always ``True``.
    """
    by_count = operator.itemgetter(1)
    with codecs.open(filename, "w", "utf-8") as out:
        ranked = sorted(self.letter.items(), key=by_count, reverse=True)
        for glyph, count in ranked:
            if count != 0:
                out.write(u"%s - %d\n" % (glyph, count))
    return True
class Bigram(Unigram):
def __init__(self, filename):
    """Initialise the ``Unigram`` base and allocate the bigram table.

    ``self.letter2`` gets one row per entry of
    ``tamil.utf8.tamil_letters``; each row is an independent shallow copy
    of ``self.letter``.

    :param filename: forwarded unchanged to ``Unigram.__init__``.
    """
    Unigram.__init__(self, filename)
    # NOTE(review): self.letter is presumably the per-letter count table
    # created by Unigram.__init__ (not visible here) -- confirm.
    self.letter2 = {
        first: copy.copy(self.letter) for first in tamil.utf8.tamil_letters
    }
def language_model(self,verbose=True):
""" builds a Tamil bigram letter model """
# use a generator in corpus
prev = None
for next_letter in self.corpus.next_tamil_letter():
# update frequency from corpus
if prev:
self.letter2[prev][next_letter] += 1
if ( verbose ) :
print(prev)