Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_accuracy(self):
    """Check that tagging the sample Vietnamese headline gives a result of length 13."""
    headline = u"Tổng Bí thư: Ai trót để tay nhúng chàm thì hãy sớm tự gột rửa"
    tagged = pos_tag(headline)
    # Only the length of the result is pinned here, not the individual tags.
    self.assertEqual(len(tagged), 13)
def test_simple_cases(self):
    """Check that tagging an empty string gives an empty list."""
    # Edge case: an empty input sentence is compared against [].
    self.assertEqual(pos_tag(u""), [])
elif pos_tagger == main.tr('botok - Tibetan POS Tagger'):
word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]
botok_tokenizer = wordless_text_utils.check_botok_tokenizers(main,
word_tokenizer = word_tokenizer)
tokens = botok_tokenizer.tokenize(' '.join(tokens))
for token in tokens:
if token.pos:
tokens_tagged.append((token.text, token.pos))
else:
tokens_tagged.append((token.text, token.chunk_type))
# Vietnamese
elif pos_tagger == main.tr('Underthesea - Vietnamese POS Tagger'):
tokens_tagged = underthesea.pos_tag(' '.join(tokens))
# Convert to Universal Tagset
if (tagset == 'custom' and main.settings_custom['pos_tagging']['to_universal_pos_tags'] or
tagset == 'universal'):
mappings = {tag: tag_universal
for tag, tag_universal, _, _ in main.settings_custom['tagsets']['mappings'][lang][pos_tagger]}
tokens_tagged = list(tokens_tagged)
# Issue warnings if any tag is missing from the mapping table
for _, tag in tokens_tagged:
if tag not in mappings:
print(f'Warning: tag "{tag}" is missing from the {wordless_conversion.to_lang_text(main, lang)} mapping table!')
tokens_tagged = [(token, mappings.get(tag, 'X'))
for token, tag in tokens_tagged]
Examples
--------
>>> # -*- coding: utf-8 -*-
>>> from underthesea import chunk
>>> sentence = "Nghi vấn 4 thi thể Triều Tiên trôi dạt bờ biển Nhật Bản"
>>> chunk(sentence)
[('Nghi vấn', 'N', 'B-NP'),
('4', 'M', 'B-NP'),
('thi thể', 'N', 'B-NP'),
('Triều Tiên', 'Np', 'B-NP'),
('trôi dạt', 'V', 'B-VP'),
('bờ biển', 'N', 'B-NP'),
('Nhật Bản', 'Np', 'B-NP')]
"""
sentence = pos_tag(sentence)
crf_model = CRFChunkingPredictor.Instance()
result = crf_model.predict(sentence, format)
return result