Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
if not value or value == '_':
return None
if fullmatch(ID_SINGLE, value):
return int(value)
elif fullmatch(ID_RANGE, value):
from_, to = value.split("-")
from_, to = int(from_), int(to)
if to > from_:
return (int(from_), "-", int(to))
elif fullmatch(ID_DOT_ID, value):
return (int(value.split(".")[0]), ".", int(value.split(".")[1]))
raise ParseException("'{}' is not a valid ID.".format(value))
def parse_token_and_metadata(data, fields=None, field_parsers=None, metadata_parsers=None):
if not data:
raise ParseException("Can't create TokenList, no data sent to constructor.")
fields = fields or DEFAULT_FIELDS
if not field_parsers:
field_parsers = DEFAULT_FIELD_PARSERS.copy()
elif sorted(field_parsers.keys()) != sorted(fields):
new_field_parsers = DEFAULT_FIELD_PARSERS.copy()
new_field_parsers.update(field_parsers)
field_parsers = new_field_parsers
tokens = []
metadata = Metadata()
for line in data.split('\n'):
line = line.strip()
if len(line) == 1:
raise ParseException("Invalid line format, line must contain either tabs or two spaces.")
data = Token()
for i, field in enumerate(fields):
# Allow parsing CoNNL-U files with fewer columns
if i >= len(line):
break
if field in field_parsers:
try:
value = field_parsers[field](line, i)
except ParseException as e:
raise ParseException("Failed parsing field '{}': ".format(field) + str(e))
else:
value = line[i]
data[text(field)] = value
return data
def head_to_token(sentence):
    """Group the tokens of a sentence by the value of their "head" field.

    Tokens whose "id" is present but not an ``int`` (range or decimal IDs)
    and tokens with a negative "head" are left out of the grouping.

    Args:
        sentence: A non-empty sequence of mapping-like tokens. The first
            token must contain a "head" key.

    Returns:
        A ``defaultdict(list)`` mapping each head value to the list of tokens
        that have that head, in sentence order. Key ``0`` holds exactly one
        token (the root).

    Raises:
        ParseException: If ``sentence`` is empty, if the first token has no
            "head" field, if no token has head ``0``, or if more than one
            token has head ``0``.
    """
    if not sentence:
        raise ParseException("Can't parse tree, need a tokenlist as input.")

    if "head" not in sentence[0]:
        raise ParseException("Can't parse tree, missing 'head' field.")

    # NOTE(review): a token whose "head" is None would fail the `< 0`
    # comparison below on Python 3 -- confirm callers never pass one.
    head_indexed = defaultdict(list)
    for token in sentence:
        # Filter out range and decimal ID:s before building tree
        if "id" in token and not isinstance(token["id"], int):
            continue

        # Filter out tokens with negative head, they are sometimes used to
        # specify tokens which should not be included in tree
        if token["head"] < 0:
            continue

        head_indexed[token["head"]].append(token)

    # Tokens with head 0 are the root candidates; exactly one is required.
    roots = head_indexed[0]
    if len(roots) == 0:
        raise ParseException("Found no head node, can't build tree")

    if len(roots) > 1:
        raise ParseException("Can't parse tree, found multiple root nodes.")

    return head_indexed
def print_tree(self, depth=0, indent=4, exclude_fields=DEFAULT_EXCLUDE_FIELDS):
if not self.token:
raise ParseException("Can't print, token is None.")
if "deprel" not in self.token or "id" not in self.token:
raise ParseException("Can't print, token is missing either the id or deprel fields.")
relevant_data = self.token.copy()
for key in exclude_fields:
if key in relevant_data:
del relevant_data[key]
node_repr = ' '.join([
'{key}:{value}'.format(key=key, value=value)
for key, value in relevant_data.items()
])
print(' ' * indent * depth + '(deprel:{deprel}) {node_repr} [{idx}]'.format(
deprel=self.token['deprel'],
def parse_int_value(value):
    """Convert a textual integer field to ``int``.

    Returns ``None`` when ``value`` is the ``'_'`` marker, and ``int(value)``
    when ``value`` fully matches the ``INTEGER`` pattern.

    Raises:
        ParseException: If ``value`` is neither ``'_'`` nor a full match for
            ``INTEGER``.
    """
    # The underscore marker maps to None and is handled before any matching.
    if value == '_':
        return None

    # Reject anything that is not a complete match for the integer pattern.
    if not fullmatch(INTEGER, value):
        raise ParseException("'{}' is not a valid value for parse_int_value.".format(value))

    return int(value)