# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_parse_line_fewer_columns(self):
    """A line carrying only the first five CoNLL-U columns still parses."""
    line = "1\tThe\tthe\tDET\tDT"
    expected = Token([
        ('id', 1),
        ('form', 'The'),
        ('lemma', 'the'),
        ('upos', 'DET'),
        ('xpos', 'DT'),
    ])
    self.assertEqual(parse_line(line, fields=DEFAULT_FIELDS), expected)
def test_empty(self):
    """A non-integer ID column raises ParseException that names the 'id' field."""
    bad_line = "invalid_id\t_\t_\t_\t_\t_\t_\t_\t_\t"
    with self.assertRaises(ParseException) as ctx:
        parse_line(bad_line, fields=DEFAULT_FIELDS)
    prefix = "Failed parsing field 'id'"
    # Compare only the leading part of the message, as the original did
    # with a slice of the same length.
    self.assertEqual(str(ctx.exception)[:len(prefix)], prefix)
def extract_token_info_from_companion_data(self):
    """Rebuild token-level annotation from the companion CoNLL-U rows.

    Each entry of ``self.companion`` is a sequence of column strings;
    it is re-joined with tabs and handed to ``parse_line``.

    Returns a dict with four lists: ``tokens``, ``lemmas``,
    ``pos_tags`` (each filtered of None entries) and ``token_range``
    (one tuple of ints per parsed row).
    """
    annotation = []
    for line in self.companion:
        # parse_line expects a single tab-separated string, so the
        # stored field list is joined back together first.
        line = '\t'.join(line)
        annotation.append(parse_line(line, DEFAULT_FIELDS))
    # Skip entries whose field parsed to None.
    tokens = [x["form"] for x in annotation if x["form"] is not None]
    lemmas = [x["lemma"] for x in annotation if x["lemma"] is not None]
    pos_tags = [x["upostag"] for x in annotation if x["upostag"] is not None]
    # NOTE(review): assumes the first value of each token's `misc` dict
    # is a colon-separated numeric span like "start:end" — confirm
    # against whatever writes the companion data.
    token_range = [tuple([int(i) for i in list(x["misc"].values())[0].split(':')]) for x in annotation]
    return {"tokens": tokens,
            "lemmas": lemmas,
            "pos_tags": pos_tags,
            "token_range": token_range}
def extract_token_info_from_companion_data(self):
    """Collect forms, lemmas, POS tags and character spans from companion rows."""
    parsed = [parse_line('\t'.join(row), DEFAULT_FIELDS)
              for row in self.companion]
    spans = []
    for entry in parsed:
        # The first `misc` value holds a colon-separated numeric span.
        first_misc = list(entry["misc"].values())[0]
        spans.append(tuple(int(part) for part in first_misc.split(':')))
    return {
        "tokens": [e["form"] for e in parsed if e["form"] is not None],
        "lemmas": [e["lemma"] for e in parsed if e["lemma"] is not None],
        "pos_tags": [e["upostag"] for e in parsed if e["upostag"] is not None],
        "token_range": spans,
    }
def lazy_parse(text, fields=DEFAULT_FIELDS):
    """Yield one list of parsed token dicts per sentence in *text*.

    Sentences are separated by blank lines; within a sentence, empty
    lines and comment lines (leading '#', after stripping whitespace)
    are skipped. Empty chunks between separators yield nothing.
    """
    for chunk in text.split(u"\n\n"):
        if not chunk:
            continue
        parsed = []
        for raw in chunk.split(u"\n"):
            if raw and not raw.strip().startswith(u"#"):
                parsed.append(parse_line(raw, fields))
        yield parsed
def extract_token_info_from_companion_data(self):
    """Gather token info (forms, lemmas, POS tags, char spans) from companion data."""
    annotation = []
    for row in self.companion:
        annotation.append(parse_line('\t'.join(row), DEFAULT_FIELDS))

    def _present(field):
        # All non-None values of one field, in original order.
        return [tok[field] for tok in annotation if tok[field] is not None]

    def _span(tok):
        # First `misc` value is a colon-separated run of integers.
        raw = list(tok["misc"].values())[0]
        return tuple(int(piece) for piece in raw.split(':'))

    return {"tokens": _present("form"),
            "lemmas": _present("lemma"),
            "pos_tags": _present("upostag"),
            "token_range": [_span(tok) for tok in annotation]}
def _lazy_parse(text: str, fields=DEFAULT_FIELDS):
    """
    Lazily parse conllu annotations, producing a list of
    OrderedDict-like token objects for each non-empty sentence.
    """
    for block in text.split("\n\n"):
        if not block:
            continue
        rows = [ln for ln in block.split("\n")
                if ln and not ln.strip().startswith("#")]
        yield [parse_line(ln, fields) for ln in rows]