Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def load_input(input_file):
    """Return the first tab-separated column of ``input_file`` as one string.

    The file content is obtained through ``read``, stripped of surrounding
    whitespace and split on newlines; the first field of every line is kept
    and the fields are joined with single spaces.
    """
    raw = read(input_file).strip()
    first_columns = []
    for row in raw.split("\n"):
        # Only the leading field (before the first tab) is part of the input.
        first_columns.append(row.split("\t")[0])
    return u" ".join(first_columns)
def load_output(input_file):
    """Return the rows of ``input_file`` as a list of tuples.

    The content returned by ``read`` is stripped and split on newlines; each
    line becomes one tuple holding its tab-separated fields.
    """
    rows = read(input_file).strip().split("\n")
    return [tuple(row.split("\t")) for row in rows]
def test_1(self):
    """Check that ``word_sent`` handles ``self.text`` faster than EXPECTED_SPEED.

    Speed is the number of space-separated tokens in ``tokenize(self.text)``
    divided by the wall-clock seconds spent in ``word_sent``.
    """
    token_count = len(tokenize(self.text).split(" "))
    started = time.time()
    word_sent(self.text)
    elapsed = time.time() - started  # in seconds
    if elapsed == 0:
        # No measurable duration: a speed cannot be computed, so nothing
        # is asserted.
        return
    speed = token_count / elapsed
    print("Speed: ", speed)
    self.assertGreater(speed, EXPECTED_SPEED)
def test_1(self):
    """Check that ``word_sent`` handles ``self.texts`` faster than EXPECTED_SPEED.

    Speed is the total number of space-separated tokens produced by
    ``tokenize`` over all texts, divided by the wall-clock seconds spent
    running ``word_sent`` on every text.
    """
    n_tokens = sum(len(tokenize(text).split(" ")) for text in self.texts)
    start = time.time()
    for text in self.texts:
        word_sent(text)
    duration = time.time() - start  # in seconds
    if duration == 0:
        # The clock did not advance between the two readings, so dividing
        # would raise ZeroDivisionError. Skip the check, as the single-text
        # benchmark does.
        return
    speed = n_tokens / duration
    print("Speed: ", speed)
    self.assertGreater(speed, EXPECTED_SPEED)
def test_save(self):
    """Saving a loaded PlainTextCorpus writes four files to the target folder.

    The corpus is loaded from ``self.plaintext_folder`` and saved to
    ``self.saved_plaintext_folder``, which is removed afterwards.
    """
    corpus = PlainTextCorpus()
    corpus.load(self.plaintext_folder)
    try:
        corpus.save(self.saved_plaintext_folder)
        files = listdir(self.saved_plaintext_folder)
        self.assertEqual(4, len(files))
    finally:
        # Best-effort cleanup that also runs when save() or the assertion
        # fails, so a failing run does not leave the folder behind.
        shutil.rmtree(self.saved_plaintext_folder, ignore_errors=True)
def test___init__(self):
    """A freshly constructed PlainTextCorpus has ``documents`` set to None."""
    self.assertIsNone(PlainTextCorpus().documents)
def test_text_1(self):
    """``Text`` applied to a unicode literal yields a value ``is_unicode`` accepts."""
    raw = u"đi học"
    converted = Text(raw)
    self.assertTrue(is_unicode(converted))
def save_temp(id, output):
    """Write ``output`` to ``<samples_dir>/<id>.actual``.

    Each item of ``output`` becomes one line whose fields are separated by
    tabs; lines are separated by newlines.
    """
    rows = (u"\t".join(fields) for fields in output)
    target = join(samples_dir, "%s.actual" % id)
    write(target, u"\n".join(rows))
def test_special_cases_2(self):
    """``word_tokenize`` returns a lone "=" as a single token."""
    tokens = word_tokenize(u"=")
    self.assertEqual(tokens, ["="])
def test_special_cases_3(self):
    """``word_tokenize`` returns the string "=))" as a single token."""
    tokens = word_tokenize(u"=))")
    self.assertEqual(tokens, ["=))"])