Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
if 'terminal' not in attrib:
cat = Category.parse(attrib['category'])
children = [rec(spans[child]) for child in attrib['child'].split(' ')]
if len(children) == 1:
return Tree.make_unary(cat, children[0], lang)
else:
assert len(children) == 2
left, right = children
combinator = guess_combinator_by_triplet(
binary_rules, cat, left.cat, right.cat)
combinator = combinator or UNKNOWN_COMBINATOR
return Tree.make_binary(cat, left, right, combinator, lang)
else:
cat = Category.parse(attrib['category'])
word = try_get_surface(tokens[attrib['terminal']])
return Tree.make_terminal(word, cat, lang)
def parse_leaf(self):
    """Parse one leaf entry and return it as a terminal tree.

    Advances ``self.word_id`` by one, reads the category string and the
    slash-separated token fields through ``self.check`` / ``self.next``,
    records a ``Token`` in ``self.tokens`` and builds the terminal node.

    Returns:
        The value of ``Tree.make_terminal(surf, cat, self.lang)``.

    Raises:
        ValueError: if the token part does not split into exactly four
            ``/``-separated fields (surf, base, pos1, pos2).
    """
    self.word_id += 1
    self.check('{')
    # The first character of the category chunk is discarded.
    cat = self.next(' ')[1:]
    # Keep only the text before the first '_'. partition() leaves the string
    # untouched when no '_' is present, whereas slicing with find() (which
    # returns -1 in that case) would silently drop the last character.
    cat = cat.partition('_')[0]
    cat = DEPENDENCY.sub('', cat)
    cat = Category.parse(cat)
    # The last character of the token chunk is discarded before splitting.
    surf, base, pos1, pos2 = self.next('}')[:-1].split('/')
    token = Token(surf=surf, base=base, pos1=pos1, pos2=pos2)
    self.tokens.append(token)
    return Tree.make_terminal(surf, cat, self.lang)
This reads the treebank while taking care of those categories.
"""
for line in open(filename):
line = line.strip()
if len(line) == 0:
continue
if line.startswith("ID"):
name = line
else:
tokens = []
for token in line.split(' '):
if token[0] == '(' and token.endswith(')[conj]'):
token = token[:-6]
tokens.append(token)
line = ' '.join(tokens)
tree, tokens = Tree.of_auto(line, lang)
yield name, tokens, tree
def rec(node):
    """Recursively convert an XML element into a ``Tree``.

    A ``rule`` element becomes a unary or binary node built from its child
    elements; any other element must be an ``lf`` element and becomes a
    terminal node. For every ``lf`` element a ``Token`` is appended to the
    enclosing ``tokens`` list.

    Args:
        node: element exposing ``tag``, ``attrib`` and iteration over its
            direct children.

    Returns:
        The ``Tree`` built for ``node``.

    Raises:
        AssertionError: if a ``rule`` element has more than two children
            (or none), or if a non-``rule`` element is not tagged ``lf``.
        KeyError: if a required attribute is missing.
    """
    attrib = node.attrib
    if node.tag == 'rule':
        cat = Category.parse(attrib['cat'])
        # Iterating an element yields its direct children. This replaces
        # Element.getchildren(), which is deprecated and was removed from
        # xml.etree.ElementTree in Python 3.9.
        children = [rec(child) for child in node]
        if len(children) == 1:
            return Tree.make_unary(cat, children[0], lang)
        assert len(children) == 2
        left, right = children
        combinator = guess_combinator_by_triplet(
            binary_rules, cat, left.cat, right.cat)
        # Fall back to the placeholder combinator when no rule matches.
        combinator = combinator or UNKNOWN_COMBINATOR
        return Tree.make_binary(cat, left, right, combinator, lang)

    assert node.tag == 'lf'
    cat = Category.parse(attrib['cat'])
    word = attrib['word']
    token = Token(word=word,
                  pos=attrib['pos'],
                  entity=attrib['entity'],
                  lemma=attrib['lemma'],
                  chunk=attrib['chunk'])
    tokens.append(token)
    return Tree.make_terminal(word, cat, lang)
tokens = []
op = combinators[op[1:]]
cat = DEPENDENCY.sub('', self.next(' '))
cat = Category.parse(cat)
self.check('{')
children = []
while self.peek() != '}':
children.append(self.next_node())
if self.peek() == ' ':
self.next(' ')
self.next('}')
if len(children) == 1:
return Tree.make_unary(cat, children[0], self.lang)
else:
assert len(children) == 2, f'failed to parse, invalid number of children: {self.line}'
left, right = children
return Tree.make_binary(cat, left, right, op, self.lang)
reduce(item[:-1])
if isinstance(stack[-1], str):
word = stack.pop()
category = stack.pop()
tree = Tree.make_terminal(word, category, lang)
position += 1
else:
assert isinstance(stack[-1], Tree)
children = []
while isinstance(stack[-1], Tree):
tree = stack.pop()
children.append(tree)
category = stack.pop()
if len(children) == 1:
tree = Tree.make_unary(category, children[0], lang)
elif len(children) == 2:
right, left = children
combinator = guess_combinator_by_triplet(
binary_rules, category, left.cat, right.cat)
combinator = combinator or UNKNOWN_COMBINATOR
tree = Tree.make_binary(category, left, right, combinator, lang)
else:
assert False
stack.append(tree)
stack.append(tree)
def rec() -> None:
    """Consume every item left in ``buf`` and update the parse stack.

    Items are taken from the end of ``buf`` one at a time. An item starting
    with ``(`` pushes the category parsed from the rest of the item onto
    ``stack``; otherwise an item ending with ``)`` is handed to ``reduce``.

    Implemented as a loop rather than one recursive call per item, so the
    number of buffered items is not bounded by the interpreter recursion
    limit.

    Raises:
        AssertionError: if an item neither starts with ``(`` nor ends
            with ``)``.
    """
    while buf:
        item = buf.pop()
        assert item[0] == '(' or item[-1] == ')'
        if item[0] == '(':
            stack.append(Category.parse(item[1:]))
        elif item[-1] == ')':
            reduce(item)
try:
rec()
assert len(stack) == 1 and isinstance(stack[0], Tree)
except:
raise RuntimeError('Parse failed on an invalid CCG tree')
return stack[0], tokens
def reduce(item: str) -> None:
nonlocal position
if item[-1] != ')':
token = Token(word=item)
tokens.append(token)
stack.append(item)
return
reduce(item[:-1])
if isinstance(stack[-1], str):
word = stack.pop()
category = stack.pop()
tree = Tree.make_terminal(word, category, lang)
position += 1
else:
assert isinstance(stack[-1], Tree)
children = []
while isinstance(stack[-1], Tree):
tree = stack.pop()
children.append(tree)
category = stack.pop()
if len(children) == 1:
tree = Tree.make_unary(category, children[0], lang)
elif len(children) == 2:
right, left = children
combinator = guess_combinator_by_triplet(
binary_rules, category, left.cat, right.cat)
combinator = combinator or UNKNOWN_COMBINATOR
tree = Tree.make_binary(category, left, right, combinator, lang)
else:
assert False
stack.append(tree)