from os import listdir
import shutil

from underthesea.corpus import PlainTextCorpus


def test_save(self):
    corpus = PlainTextCorpus()
    corpus.load(self.plaintext_folder)
    corpus.save(self.saved_plaintext_folder)
    files = listdir(self.saved_plaintext_folder)
    self.assertEqual(4, len(files))
    try:
        shutil.rmtree(self.saved_plaintext_folder)
    except Exception:
        pass
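
# Note: test_save references fixtures defined elsewhere in the test class.
# A plausible setUp, sketched here as a hypothetical (folder names are
# assumptions, not from the source):
#
#     def setUp(self):
#         self.plaintext_folder = join(dirname(__file__), "plaintext")
#         self.saved_plaintext_folder = join(dirname(__file__), "saved_plaintext")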

def test___init__(self):
    corpus = PlainTextCorpus()
    self.assertIsNone(corpus.documents)

from os.path import dirname
from os.path import join

from underthesea.corpus import PlainTextCorpus

model_name = "output_crf"
model_output_folder = join(dirname(dirname(__file__)), "data", "corpus", "train", model_name)
input_folder = join(dirname(dirname(__file__)), "data", "corpus", "train", "input")
actual_corpus = PlainTextCorpus()
actual_corpus.load(model_output_folder)
input_corpus = PlainTextCorpus()
input_corpus.load(input_folder)
f = open(join(dirname(__file__), "error_analysis", "input_word.txt"), "w")
f1 = open(join(dirname(__file__), "error_analysis", "output_word.txt"), "w")
# Collect every token from the model output and from the gold input
actual_words = []
input_words = []
for a in actual_corpus.documents:
    for a_sentence in a.sentences:
        for a_word in a_sentence.split(' '):
            actual_words.append(a_word)
for i in input_corpus.documents:
    for i_sentence in i.sentences:
        for i_word in i_sentence.split(' '):
            input_words.append(i_word)
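
# The two files above are opened but never written or closed in this
# snippet. A hedged completion (assumed, not from the source), writing
# one token per line:
for word in input_words:
    f.write(word + "\n")
for word in actual_words:
    f1.write(word + "\n")
f.close()
f1.close()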

def compare_dictionary(model_output_folder):
    # f = open(join(dirname(__file__), "logs", "crf", "new_word.txt"), "w")
    # f1 = open(join(dirname(__file__), "logs", "crf", "word_in_dictionary.txt"), "w")
    corpus = PlainTextCorpus()
    corpus.load(model_output_folder)
    new_words = []
    words = []
    # Multi-syllable tokens are joined with '_' in the model output
    for document in corpus.documents:
        for sentence in document.sentences:
            for word in sentence.split(" "):
                if '_' in word:
                    new_words.append(word)
    # viet_dict_11K is assumed to be imported elsewhere; the import is
    # not shown in this snippet
    dictionary = viet_dict_11K.words
    for word in new_words:
        words.append(word.replace('_', ' '))
    new_word = sorted(set(x for x in words if x not in dictionary))
    new_word_per_dict = float(len(new_word)) / float(len(dictionary)) * 100
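
    # The return statement is elided in the original; the commented-out call
    # further down, "(new_word, word_in_dictionary) = compare_dictionary(...)",
    # suggests the function returns the out-of-dictionary words alongside the
    # in-dictionary ones. A hedged completion:
    word_in_dictionary = [x for x in words if x in dictionary]
    return new_word, word_in_dictionary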

from os.path import dirname
from os.path import join

import pandas as pd

from underthesea.corpus import PlainTextCorpus


def count_token(documents):
    count = 0
    for document in documents:
        for sentence in document.sentences:
            for word in sentence.split(' '):
                count += 1
    return count


f = open(join(dirname(__file__), "eda", "anonymous", "stats.txt"), "w")
f.write("[Statistics] Train Data Set\n")
train_folder = join(dirname(__file__), "corpus", "anonymous", "train")
train_corpus = PlainTextCorpus()
train_corpus.load(train_folder)
f.write("Total documents: %d\n" % len(train_corpus.documents))
# Number of sentences per document
s = pd.Series([len(d.sentences) for d in train_corpus.documents])
print(s.describe())
f.write("Min sentences per document: %d\n" % s.describe()['min'])
f.write("Max sentences per document: %d\n" % s.describe()['max'])
f.write("Total sentences: %d\n" % sum(s))
f.write("Total token: %d\n" % count_token(train_corpus.documents))
f.write("\n")
f.write("[Statistics] Test Data Set\n")
test_folder = join(dirname(__file__), "corpus", "anonymous", "test", "output")
test_corpus = PlainTextCorpus()
test_corpus.load(test_folder)
# (new_word, word_in_dictionary) = compare_dictionary(train_folder)
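
# Hedged completion (the test-set statistics are elided in the snippet);
# presumably the same measurements are repeated for the test corpus:
f.write("Total documents: %d\n" % len(test_corpus.documents))
s_test = pd.Series([len(d.sentences) for d in test_corpus.documents])
f.write("Total sentences: %d\n" % sum(s_test))
f.write("Total token: %d\n" % count_token(test_corpus.documents))
f.close()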

def load_train_sents(self):
    corpus = PlainTextCorpus()
    file_path = join(dirname(dirname(dirname(__file__))), "corpus", "anonymous", "train")
    corpus.load(file_path)
    sentences = []
    for document in corpus.documents:
        for sentence in document.sentences:
            if sentence != "":
                sentences.append(sentence)
    return sentences

from os.path import join, dirname

from model import DummyModel
from underthesea.corpus import PlainTextCorpus

input_folder = join(dirname(dirname(dirname(__file__))), "data", "corpus", "train", "input")
output_dummy_folder = join(dirname(dirname(dirname(__file__))), "data", "corpus", "train", "output_dummy")
corpus = PlainTextCorpus()
corpus.load(input_folder)
model = DummyModel()
for document in corpus.documents:
    sentences = [sentence.lower() for sentence in document.sentences]
    # Replace each document's sentences with the model predictions in place
    document.sentences = [model.predict(s) for s in sentences]
corpus.save(output_dummy_folder)

def get_data():
    # model_name is assumed to be defined at module level, e.g. "output_crf"
    output_folder = join(dirname(dirname(dirname(__file__))), "data", "corpus_2", "test", "output")
    model_output_folder = join(dirname(dirname(dirname(__file__))), "data", "corpus_2", "test", "output_%s" % model_name)
    expected_corpus = PlainTextCorpus()
    expected_corpus.load(output_folder)
    actual_corpus = PlainTextCorpus()
    actual_corpus.load(model_output_folder)
    return expected_corpus, actual_corpus

# -*- coding: utf-8 -*-
from os.path import dirname
from os.path import join
import time

from model import CRFModel
from underthesea.corpus import PlainTextCorpus

start = time.time()
input_folder = join(dirname(dirname(dirname(__file__))), "data", "corpus", "test", "input")
output_crf_folder = join(dirname(dirname(dirname(__file__))), "data", "corpus", "test", "output_crf")
# input_folder = join(dirname(dirname(dirname(__file__))), "data", "test", "input")
# output_crf_folder = join(dirname(dirname(dirname(__file__))), "data", "test", "output")
corpus = PlainTextCorpus()
corpus.load(input_folder)
model = CRFModel()
# Tag every sentence in the corpus with the CRF model
for document in corpus.documents:
    print(document.id)
    output = []
    for sentence in document.sentences:
        output.append(model.predict(sentence))
    document.sentences = output
count = 0
for document in corpus.documents:
    count += len(document.sentences)
# path = join(dirname(dirname(dirname(__file__))), 'data', 'raw', 'train', 'output')
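
# Hedged completion (the timer started above is never read in the snippet);
# presumably the elapsed time is reported at the end:
print("Tagged %d sentences in %.2f seconds" % (count, time.time() - start))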