def test_unicode(self):
    word = viet_dict_11K.words[0]
    self.assertEqual(unicode, type(word))

def test_viet_dict(self):
    words = viet_dict_11K.words
    self.assertEqual(11373, len(words))
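
# viet_dict_11K is a project-local module exposing a fixed Vietnamese word
# list; the tests above pin its size (11,373 entries) and confirm entries
# are unicode strings (this is Python 2 code).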
def compare_dictionary(model_output_folder):
    """Split the tokenized model output into words found in the dictionary
    and words that are not (multi-syllable tokens are joined with '_')."""
    # f = open(join(dirname(__file__), "logs", "crf", "new_word.txt"), "w")
    # f1 = open(join(dirname(__file__), "logs", "crf", "word_in_dictionary.txt"), "w")
    corpus = PlainTextCorpus()
    corpus.load(model_output_folder)
    new_words = []
    words = []
    for document in corpus.documents:
        for sentence in document.sentences:
            for word in sentence.split(" "):
                if '_' in word:
                    new_words.append(word)
    dictionary = viet_dict_11K.words
    for word in new_words:
        words.append(word.replace('_', ' '))
    new_word = [x for x in words if x not in dictionary]
    new_word = sorted(set(new_word))
    new_word_per_dict = float(len(new_word)) / float(len(dictionary)) * 100
    # f.write("Scale word not in dictionary %0.2f: \n" % new_word_per_dict)
    # for word in new_word:
    #     f.write(word.encode('utf-8') + "\n")
    word_in_dictionary = [x for x in words if x in dictionary]
    word_in_dictionary = sorted(set(word_in_dictionary))
    word_in_dictionary_per_total = float(len(word_in_dictionary)) / float(len(viet_dict_11K.words))
    # f1.write("scale word in dictionary: %0.2f \n" % word_in_dictionary_per_total)
    # for word in word_in_dictionary:
    #     f1.write(word.encode('utf-8') + "\n")
    return new_word, word_in_dictionary
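
# A minimal usage sketch, assuming "output/" holds tokenized plain-text
# files in the layout PlainTextCorpus.load() expects (the folder name is
# hypothetical):
#
#     new_word, word_in_dictionary = compare_dictionary("output/")
#     print("new words: %d, in dictionary: %d"
#           % (len(new_word), len(word_in_dictionary)))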
def predict(self, sentence):
    """Dictionary-based tokenization: every dictionary entry found in the
    sentence has its spaces replaced with underscores."""
    words = viet_dict_11K.words
    # Drop entries with a trailing space (dirty dictionary lines).
    dictionary = [word for word in words if re.search(" $", word) is None]
    tokenized_words = [word.replace(" ", "_") for word in dictionary]
    s = sentence
    # Note: entries are applied in dictionary order, so an earlier short
    # entry can break up a later, longer one.
    for word, tokenized_word in zip(dictionary, tokenized_words):
        if word in sentence:
            s = s.replace(word, tokenized_word)
    return s
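
# Illustration of the replacement step with a toy two-entry dictionary
# (the entries and sentence below are made up for the example):
#
#     dictionary = [u"học sinh", u"đi học"]
#     s = u"học sinh đi học"
#     for word in dictionary:
#         if word in s:
#             s = s.replace(word, word.replace(" ", "_"))
#     # s == u"học_sinh đi_học"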
f.write("IW\t%s\t%s\t%s\n" % (confusion_matrix[1][0], confusion_matrix[1][1], confusion_matrix[1][2]))
f.write("O\t%s\t\t%s\t\t%s\n" % (confusion_matrix[2][0], confusion_matrix[2][1], confusion_matrix[2][2]))
plt.figure()
class_name = ["BW", "IW", "O"]
Confution_Matrix.plot_confusion_matrix(confusion_matrix, classes=class_name,
                                       title='Confusion matrix')
f.write("\n\n")
(new_word, word_in_dictionary) = compare_dictionary(model_output_folder)
f.write("Word Analysis: \n")
f.write("- Word in dictionary : %d\n" % len(word_in_dictionary))
f.write("- New Word : %d\n" % len(new_word))
# Ratio of new words to the dictionary size.
coverage = float(len(new_word)) / float(len(viet_dict_11K.words))
f.write("- Word Coverage : %0.2f\n" % coverage)
f.write("\n\n")
plt.savefig('confusion matrix.png')
plt.show()
time_stop = time.time()
time_per_token = (time_stop - time_start) / float(count_token(actual_corpus.documents))
f.write("Time speed: %0.6f second per token\n" % time_per_token)
print 0
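
# count_token is not shown in this snippet; a plausible implementation
# (an assumption, not the project's actual helper) counts whitespace-
# separated tokens across all sentences of all documents:
#
#     def count_token(documents):
#         total = 0
#         for document in documents:
#             for sentence in document.sentences:
#                 total += len(sentence.split(" "))
#         return total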