Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_subtrees_for_phrase(self):
    """The first NP subtree is a Tree whose last child's first leaf is 'property'."""
    t = self._sentence.subtrees_for_phrase("NP")[0]
    self.assertIsInstance(t, Tree)
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual("property", t[-1].leaves()[0])
# NOTE(review): fragment of a larger tree-combination routine — `node0`,
# `node1`, `arcdict`, `arc`, and `idx` are defined in code not visible here.
# Combine the two nodes' children, splicing in the children of any node
# whose label is the empty string (an unlabeled placeholder node).
if node0.label() != '' and node1.label() != '':
tr = [node0, node1]
elif node0.label() == '' and node1.label() != '':
tr = [c for c in node0] + [node1]
elif node0.label() != '' and node1.label() == '':
tr = [node0] + [c for c in node1]
elif node0.label() == '' and node1.label() == '':
tr = [c for c in node0] + [c for c in node1]
# A '+'-joined arc label encodes a chain of unary nodes; rebuild the chain
# innermost-first (hence the reverse), wrapping `tr` in one Tree per label.
arc_list = str(arcdict[arc[idx]]).split('+')
arc_list.reverse()
for a in arc_list:
if isinstance(tr, nltk.Tree):
tr = [tr]
tr = nltk.Tree(a, tr)
return tr
set:
set of syntactic productions
"""
ret = set()
# obtain token indices for each arg sentence
snt_id = None
snt2tok = self._get_snt2tok(a_rel[a_arg][TOK_LIST])
# obtain set of leaves corresponding to that argument
arg_leaves = set()
subt_leaves = set()
processed_leaves = set()
itree = itree_str = inode_path = None
for snt_id, toks in snt2tok.iteritems():
itree_str = a_parses[a_doc_id][SENTENCES][snt_id][PARSE_TREE]
itree = Tree.fromstring(itree_str)
if not itree.leaves():
print("Invalid parse tree for sentence {:d}".format(snt_id),
file=sys.stderr)
continue
# obtain all terminal syntactic nodes from the arg
for itok in toks:
inode_path = itree.leaf_treeposition(itok)
arg_leaves.add(itree[inode_path])
# check all subtrees (not efficient, but easy to implement)
for s_t in itree.subtrees():
subt_leaves.update(s_t.leaves())
if subt_leaves.issubset(arg_leaves) and \
not subt_leaves.issubset(processed_leaves):
ret.update(str(p) for p in itree.productions()
if any(is_nonterminal(n)
for n in p.rhs()))
def to_nltk_tree2(node):
    """Recursively convert a dependency node into an nltk Tree.

    A node with no dependents becomes a bare formatted token; otherwise
    it becomes a Tree labeled with the formatted token and one converted
    child per dependent.
    """
    # Guard clause: leaves (no left or right dependents) stay plain strings.
    if node.n_lefts + node.n_rights == 0:
        return tok_format(node)
    converted_children = [to_nltk_tree2(dep) for dep in node.children]
    return Tree(tok_format(node), converted_children)
def get_nodes_for_ntlk(parent, stopwords):
    """Collect cleaned keyword phrases from the nltk.Tree children of *parent*.

    For each child that is an nltk.Tree, its leaves (assumed to be
    (token, tag) pairs — TODO confirm against the caller) are lowercased,
    joined, transliterated to ASCII, and, if not a stopword phrase,
    stripped of punctuation and appended to the result.

    :param parent: iterable of nodes (trees and/or leaves)
    :param stopwords: collection of phrases to exclude
    :return: list of keyword phrases
    """
    keywords = []
    # Hoisted out of the loop: compile the pattern once.  Raw string avoids
    # the invalid-escape-sequence warning for '\s' and '\w'.
    pattern = re.compile(r'([^\s\w-]|_)+')
    for node in parent:
        # isinstance is the idiomatic (and subclass-friendly) type check.
        if isinstance(node, nltk.Tree):
            phrase = " ".join(key.lower() for key, value in node.leaves())
            phrase = unidecode.unidecode(phrase)
            if phrase not in stopwords:
                # Drop punctuation/underscores, keep words and hyphens.
                phrase = pattern.sub('', phrase).strip()
                keywords.append(phrase)
    return keywords
def main():
# Demo driver for the SRCG induction pipeline (Python 2 syntax: note the
# print statements below).  Imports are local because this is a script
# entry point; treebank/grammar/plcfrs/containers are project modules.
from treebank import NegraCorpusReader
from grammar import induce_srcg
from plcfrs import parse, pprint_chart
from containers import Grammar
from nltk import Tree
# Read the sample Negra export file and binarize each tree with
# vertical/horizontal Markovization of order 1.
corpus = NegraCorpusReader(".", "sample2.export", encoding="iso-8859-1")
trees = list(corpus.parsed_sents())
for a in trees: a.chomsky_normal_form(vertMarkov=1, horzMarkov=1)
grammar = Grammar(induce_srcg(trees, corpus.sents()))
# Replace the corpus trees with a small hand-built treebank for the demo.
trees = [Tree.parse("(ROOT (A (a 0) (b 1)))", parse_leaf=int),
Tree.parse("(ROOT (a 0) (B (c 2) (b 1)))", parse_leaf=int),
Tree.parse("(ROOT (a 0) (B (c 2) (b 1)))", parse_leaf=int),
Tree.parse("(ROOT (C (b 0) (a 1)) (c 2))", parse_leaf=int),
Tree.parse("(ROOT (C (b 0) (a 1)) (c 2))", parse_leaf=int),
]
sents =[["a","b"],
["a","c","b"],
["a","c","b"],
["b","a","c"],
["b","a","a"]]
print "treebank:"
for a in trees: print a
print "\ngrammar:"
# NOTE(review): `grammar` is assigned twice — the Grammar built from the
# corpus above is discarded here; confirm whether that first build (and the
# unused `parse`/`pprint_chart` imports) are intentional.
grammar = induce_srcg(trees, sents)
# Print each rule as "LHS --> RHS yield-function probability".
for (r,yf),w in sorted(grammar):
print r[0], "-->", " ".join(r[1:]), yf, exp(w)
# NOTE(review): fragment of a span-to-tree assembly routine — `tree`,
# `start`, `end`, `spans_to_labels`, `labels`, and `assemble_subtree` are
# defined in code not visible here.
return [tree]
argmax_split = start + 1
# Find the next largest subspan such that
# the left hand side is a constituent.
for split in range(end - 1, start, -1):
if (start, split) in spans_to_labels:
argmax_split = split
break
# Recurse on the two halves and concatenate their subtree lists.
left_trees = assemble_subtree(start, argmax_split)
right_trees = assemble_subtree(argmax_split, end)
children = left_trees + right_trees
# Wrap the children in one Tree per label, innermost label applied last
# (labels are popped from the end of the list).
if labels is not None:
while labels:
children = [Tree(labels.pop(), children)]
return children
def _remove_subtree_from_tree(tree, subtree_to_remove):
    """Remove every subtree equal to *subtree_to_remove* from *tree*, in place.

    Recurses into each Tree child (including a just-removed match, whose own
    descendants are no longer part of *tree*, preserving the original reach).
    """
    # FIX: iterate over a snapshot.  Removing from `tree` while iterating it
    # directly shifts the remaining children left, so the sibling immediately
    # after each removed child was silently skipped.
    for st in list(tree):
        if isinstance(st, nltk.Tree):
            if st == subtree_to_remove:
                tree.remove(st)
            _remove_subtree_from_tree(st, subtree_to_remove)
    return
def to_instance(d, tokenize=None):
# Build an instance dict from an SNLI-style record `d` (keys: 'sentence1',
# 'sentence1_parse', 'sentence2', 'sentence2_parse', 'gold_label').  Parse
# strings are read as bracketed trees; their leaves give the parse tokens.
# `tokenize`, when provided, is a callable producing raw-sentence tokens;
# otherwise the *_tokens fields are None.
# NOTE(review): the returned dict literal is truncated in this view.
sentence1 = d['sentence1']
sentence1_parse = d['sentence1_parse']
sentence1_tree = nltk.Tree.fromstring(sentence1_parse)
sentence1_parse_tokens = sentence1_tree.leaves()
sentence1_tokens = tokenize(sentence1) if tokenize else None
sentence2 = d['sentence2']
sentence2_parse = d['sentence2_parse']
sentence2_tree = nltk.Tree.fromstring(sentence2_parse)
sentence2_parse_tokens = sentence2_tree.leaves()
sentence2_tokens = tokenize(sentence2) if tokenize else None
gold_label = d['gold_label']
instance = {
'sentence1': sentence1,
'sentence1_parse': sentence1_parse,
'sentence1_parse_tokens': sentence1_parse_tokens,
'sentence1_tokens': sentence1_tokens,
def demo():
"""
A demonstration showing how Tree objects can be used.
This demonstration creates a Tree from a bracketed string,
and shows the results of calling several of its methods.
"""
from nltk import Tree, ProbabilisticTree
# Demonstrate tree parsing.
s = '(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))'
t = Tree.fromstring(s)
print("Convert bracketed string into tree:")
print(t)
print(t.__repr__())
print("Display tree properties:")
print(t.label()) # tree's constituent type
print(t[0]) # tree's first child
print(t[1]) # tree's second child
print(t.height())
print(t.leaves())
print(t[1])
# Multi-index lookup: t[1, 1] is the second child of the second child.
print(t[1, 1])
print(t[1, 1, 0])
# Demonstrate tree modification.
# NOTE(review): the function is truncated in this view beyond this line.
the_cat = t[0]