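# Driver loop (a sketch of the surrounding context, which is truncated in this snippet):
# read input sentences from stdin or the keyboard and dispatch them to the parser
# according to args.input_format.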
elif not sys.stdin.isatty():
    input_type = sys.stdin
else:
    # reading from keyboard
    input_type = None
    sys.stdout.flush()
    sys.stderr.flush()
    logging.getLogger().setLevel(logging.CRITICAL)

while True:
    fin = [line for line in ([input()] if input_type is None else input_type)
           if len(line.strip()) > 0]
    if len(fin) == 0:
        break
    if args.input_format == 'POSandNERtagged':
        tagged_doc = [[Token.from_piped(token) for token in sent.strip().split(' ')]
                      for sent in fin]
        doc = [' '.join(token.word for token in sent) for sent in tagged_doc]
        res = parser.parse_doc(doc,
                               probs=probs,
                               tag_list=tag_list,
                               batchsize=args.batchsize)
    elif args.input_format == 'json':
        doc = [json.loads(line) for line in fin]
        tagged_doc = annotate_fun(
            [[word for word in sent['words'].split(' ')] for sent in doc])
        res = parser.parse_json(doc)
    elif args.input_format == 'partial':
        doc, constraints = zip(*[read_partial_tree(l.strip()) for l in fin])
        tagged_doc = annotate_fun(doc)
        res = parser.parse_doc(doc,
                               probs=probs,
                               tag_list=tag_list,
@classmethod
def from_piped(cls, string: str) -> 'Token':
    # WORD|POS|NER or WORD|LEMMA|POS|NER
    # or WORD|LEMMA|POS|NER|CHUNK
    items = string.split('|')
    if len(items) == 5:
        word, lemma, pos, entity, chunk = items
    elif len(items) == 4:
        word, lemma, pos, entity = items
        chunk = 'XX'
    else:
        assert len(items) == 3
        word, pos, entity = items
        lemma = 'XX'
        chunk = 'XX'
    return Token(word=word,
                 lemma=lemma,
                 pos=pos,
                 entity=entity,
                 chunk=chunk)
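# A minimal usage sketch (example values assumed, not from the source):
#   Token.from_piped('dogs|dog|NNS|O')
#   -> Token(word='dogs', lemma='dog', pos='NNS', entity='O', chunk='XX')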
try:
    from janome.tokenizer import Tokenizer
except ImportError:
    logger.error('failed to import janome. please install it by "pip install janome".')
    exit(1)

logger.info('use Janome to tokenize and annotate POS infos.')
tokenizer = Tokenizer()
res = []
raw_sentences = []
for sentence in sentences:
    sentence = ''.join(sentence)
    # materialize the tokens; tokenize() may return a generator in newer janome
    # versions, and we iterate over the result twice below
    tokenized = list(tokenizer.tokenize(sentence))
    tokens = []
    for token in tokenized:
        # janome exposes the part of speech as a comma-separated string of four fields
        pos, pos1, pos2, pos3 = token.part_of_speech.split(',')
        token = Token(word=token.surface,
                      surf=token.surface,
                      pos=pos,
                      pos1=pos1,
                      pos2=pos2,
                      pos3=pos3,
                      inflectionForm=token.infl_form,
                      inflectionType=token.infl_type,
                      reading=token.reading,
                      base=token.base_form)
        tokens.append(token)
    raw_sentence = [token.surface for token in tokenized]
    res.append(tokens)
    raw_sentences.append(raw_sentence)
return res, raw_sentences
res, error = proc.communicate()
try:
    tagged_sentences = res.decode('utf-8').strip().split('\n')
    tagged_sentences = [[tuple(token.split('|')) for token in sentence.strip().split(' ')]
                        for sentence in tagged_sentences]
except Exception:
    raise RuntimeError('failed to process C&C output; there may have been a problem '
                       'while running the C&C pipeline.\n'
                       f'stderr:\n {error}')

res = []
for sentence in tagged_sentences:
    words, poss = zip(*[(word, pos) for word, pos, _ in sentence])
    lemmas = stemmer.analyze(list(words), list(poss))
    tokens = [Token(word=word, pos=pos, entity=ner, lemma=lemma.lower(), chunk='XX')
              for (word, pos, ner), lemma in zip(sentence, lemmas)]
    res.append(tokens)
return res
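# Note: the C&C output consumed above is expected to be one sentence per line,
# with each token of the form WORD|POS|NER (e.g. 'dogs|NNS|O'; example values assumed).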
def reduce(item: str) -> None:
    # shift-reduce step over the items of a bracketed tree string:
    # a plain item is shifted onto the stack, and each trailing ')' closes one constituent
    nonlocal position
    if item[-1] != ')':
        token = Token(word=item)
        tokens.append(token)
        stack.append(item)
        return
    reduce(item[:-1])
    if isinstance(stack[-1], str):
        word = stack.pop()
        category = stack.pop()
        tree = Tree.make_terminal(word, category, lang)
        position += 1
    else:
        assert isinstance(stack[-1], Tree)
        children = []
        while isinstance(stack[-1], Tree):
            tree = stack.pop()
            children.append(tree)
        cat = Category.parse(attrib['cat'])
        children = [rec(child) for child in node.getchildren()]
        if len(children) == 1:
            return Tree.make_unary(cat, children[0], lang)
        else:
            assert len(children) == 2
            left, right = children
            combinator = guess_combinator_by_triplet(
                binary_rules, cat, left.cat, right.cat)
            combinator = combinator or UNKNOWN_COMBINATOR
            return Tree.make_binary(cat, left, right, combinator, lang)
    else:
        assert node.tag == 'lf'
        cat = Category.parse(attrib['cat'])
        word = attrib['word']
        token = Token(word=attrib['word'],
                      pos=attrib['pos'],
                      entity=attrib['entity'],
                      lemma=attrib['lemma'],
                      chunk=attrib['chunk'])
        tokens.append(token)
        return Tree.make_terminal(word, cat, lang)
tokens = []
res = []
for sentence in docs:
    tokens = []
    for token in sentence:
        if token.ent_iob_ == 'O':
            ner = token.ent_iob_
        else:
            ner = token.ent_iob_ + '-' + token.ent_type_
        # spaCy lemmatizes pronouns to '-PRON-'; keep the lowercased surface form instead
        if token.lemma_ == '-PRON-':
            lemma = str(token).lower()
        else:
            lemma = token.lemma_.lower()
        tokens.append(
            Token(word=str(token),
                  pos=token.tag_,
                  entity=ner,
                  lemma=lemma,
                  chunk='XX'))
    res.append(tokens)

if tokenize:
    return res, raw_sentences
else:
    return res
        return Tree.make_terminal(word, cat, lang)

    spans = {span.attrib['id']: span for span in tree.xpath('./span')}
    return rec(spans[tree.attrib['root']])
trees = etree.parse(filename).getroot()
sentences = trees[0][0].xpath('sentence')
for sentence in sentences:
    token_and_ids = []
    for token in sentence.xpath('.//token'):
        token_attribs = dict(token.attrib)
        token_id = token_attribs['id']
        # drop XML-only attributes before building the Token
        for no_need in ['id', 'start', 'cat']:
            if no_need in token_attribs:
                del token_attribs[no_need]
        token_and_ids.append((token_id, Token(**token_attribs)))
    tokens = [token for _, token in token_and_ids]
    for ccg in sentence.xpath('./ccg'):
        tree = parse(ccg, dict(token_and_ids))
        yield ccg.attrib['id'], tokens, tree
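# A minimal usage sketch, assuming the generator above is exposed as a function
# read_xml(filename) (the name and file are assumed for illustration, not from the source):
#   for tree_id, tokens, tree in read_xml('output.xml'):
#       print(tree_id, len(tokens))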