Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_parse_line(self):
    """Check that a tab-separated line parses into the expected Token.

    The expected value pins ``id`` and ``head`` as integers, ``feats`` as a
    nested Token of key/value pairs, and ``_`` placeholders as ``None``.
    """
    line = "1\tThe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t4\tdet\t_\t_"
    self.assertEqual(
        parse_line(line, fields=DEFAULT_FIELDS),
        Token([
            ('id', 1),
            ('form', 'The'),
            ('lemma', 'the'),
            ('upos', 'DET'),
            ('xpos', 'DT'),
            ('feats', Token([('Definite', 'Def'), ('PronType', 'Art')])),
            ('head', 4),
            ('deprel', 'det'),
            ('deps', None),
            ('misc', None),
        ]),
    )
def test_parse_fieldparsers_alias_xupostag(self):
    """Check that parsers keyed ``xpostag``/``upostag`` apply to ``xpos``/``upos``.

    The custom parsers repeat the column value five times, so the expected
    Token shows whether they were picked up for the short field names.
    """
    line = "1\t2"
    custom_fieldparsers = {
        "xpostag": lambda line, i: line[i] * 5,
        "upostag": lambda line, i: line[i] * 5,
    }
    self.assertEqual(
        parse_line(line, fields=["xpos", "upos"], field_parsers=custom_fieldparsers),
        Token([
            ('xpos', "11111"),
            ('upos', "22222"),
        ]),
    )
def test_parse_line_with_spaces(self):
    """Check that a space-separated line raises ParseException.

    The exception message is expected to start with 'Invalid line format'.
    """
    space_separated = "1 The the DET DT Definite=Def|PronType=Art 4 det _ _"
    expected_prefix = "Invalid line format"

    with self.assertRaises(ParseException) as caught:
        parse_line(space_separated, fields=DEFAULT_FIELDS)

    message = str(caught.exception)
    # Only the beginning of the message is pinned; anything after it is ignored.
    self.assertEqual(message[:len(expected_prefix)], expected_prefix)
def test_parse_fieldparsers_alias_two_ways(self):
    """Check that parser-name aliasing works in both directions.

    A parser keyed ``xpos`` is expected to serve the ``xpostag`` field, and a
    parser keyed ``upostag`` is expected to serve the ``upos`` field. Both
    parsers repeat the column value five times so their use is visible.
    """
    line = "1\t2"
    custom_fieldparsers = {
        "xpos": lambda line, i: line[i] * 5,
        "upostag": lambda line, i: line[i] * 5,
    }
    self.assertEqual(
        parse_line(line, fields=["xpostag", "upos"], field_parsers=custom_fieldparsers),
        Token([
            ('xpostag', "11111"),
            ('upos', "22222"),
        ]),
    )
def test_parse_custom_fieldparsers(self):
    """Check that a custom parser supplied for ``id`` replaces the default.

    Only ``id`` is requested in ``fields``, and the expected Token contains
    just that field, with the first column repeated five times by the parser.
    """
    line = "1\t2"
    custom_fieldparsers = {
        "id": lambda line, i: line[i] * 5,
    }
    self.assertEqual(
        parse_line(line, fields=["id"], field_parsers=custom_fieldparsers),
        Token([
            ('id', "11111"),
        ]),
    )
def test_parse_fieldparsers_doesnt_alias_when_exists(self):
    """Check that an exact-name parser wins over its alias.

    Parsers are registered under both the short names and the ``*tag`` names.
    The ``*tag`` parsers return the column unchanged while the short-name
    parsers repeat it five times, so the unchanged values in the expected
    Token show that the exact ``xpostag``/``upostag`` parsers were used.
    """
    line = "1\t2"
    custom_fieldparsers = {
        "xpos": lambda line, i: line[i] * 5,
        "xpostag": lambda line, i: line[i],
        "upos": lambda line, i: line[i] * 5,
        "upostag": lambda line, i: line[i],
    }
    self.assertEqual(
        parse_line(line, fields=["xpostag", "upostag"], field_parsers=custom_fieldparsers),
        Token([
            ('xpostag', "1"),
            ('upostag', "2"),
        ]),
    )
def _lazy_parse(text: str, fields=DEFAULT_FIELDS):
    """
    Lazily parse CoNLL-U style text, one sentence at a time.

    ``text`` is split into sentence chunks on blank lines ("\\n\\n"). For
    every non-empty chunk, a list is yielded containing the result of
    ``parse_line(line, fields)`` for each line that is neither empty nor a
    ``#`` comment. A chunk holding only comments yields an empty list.
    """
    for chunk in text.split("\n\n"):
        if not chunk:
            continue

        parsed_rows = []
        for row in chunk.split("\n"):
            # Skip empty lines and comment lines; leading whitespace before
            # the "#" is ignored when detecting a comment.
            if not row or row.strip().startswith("#"):
                continue
            parsed_rows.append(parse_line(row, fields))

        yield parsed_rows