def _make_tensor_descriptor_array(xs, length):
    """Make an array of pointers to tensor descriptors."""
    descs = []
    batch_size = xs.shape[0] // length
    for i in range(length):
        x = xs[i * batch_size:(i + 1) * batch_size]
        if x.ndim < 3:
            # cuDNN tensor descriptors need at least 3 dimensions,
            # so pad the shape with trailing singleton dimensions.
            shape = x.shape + (1,) * (3 - x.ndim)
            x = x.reshape(shape)
        desc = cudnn.create_tensor_nd_descriptor(x)
        descs.append(desc)
    return PointerArray([d.value for d in descs], descs)
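For reference, a minimal numpy sketch of the slicing and shape padding the helper performs; the shapes below are purely illustrative, and the real function operates on cupy arrays and wraps each chunk in a cuDNN tensor descriptor.

import numpy as np

# Illustrative input: 4 time steps, each a batch of 2 vectors of size 3,
# stacked along the first axis as _make_tensor_descriptor_array expects.
xs = np.zeros((4 * 2, 3), dtype=np.float32)
length = 4
batch_size = xs.shape[0] // length
for i in range(length):
    x = xs[i * batch_size:(i + 1) * batch_size]
    if x.ndim < 3:
        # Same padding rule as above: append singleton dimensions up to ndim 3.
        x = x.reshape(x.shape + (1,) * (3 - x.ndim))
    print(x.shape)  # (2, 3, 1) for every chunk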
def rec(node):
    attrib = node.attrib
    if 'terminal' not in attrib:
        # Non-terminal span: parse its category and recurse into its children.
        cat = Category.parse(attrib['category'])
        children = [rec(spans[child]) for child in attrib['child'].split(' ')]
        if len(children) == 1:
            return Tree.make_unary(cat, children[0], lang)
        else:
            assert len(children) == 2
            left, right = children
            # Infer the combinator from the (parent, left, right) category
            # triplet, falling back to UNKNOWN_COMBINATOR when none matches.
            combinator = guess_combinator_by_triplet(
                binary_rules, cat, left.cat, right.cat)
            combinator = combinator or UNKNOWN_COMBINATOR
            return Tree.make_binary(cat, left, right, combinator, lang)
    else:
        # Terminal span: look up the surface form of the corresponding token.
        cat = Category.parse(attrib['category'])
        word = try_get_surface(tokens[attrib['terminal']])
        return Tree.make_terminal(word, cat, lang)
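As a rough illustration of the node attributes this recursion relies on: non-terminal nodes carry 'category' and a space-separated 'child' id list, while terminal nodes carry a 'terminal' token id. The element and attribute values below are hypothetical, and spans/tokens/lang/binary_rules come from the enclosing reader.

import xml.etree.ElementTree as ET

# Hypothetical fragment: a binary span whose children are other spans,
# referenced by the ids in the 'child' attribute.
node = ET.fromstring('<span id="sp0" category="S" child="sp1 sp2"/>')
attrib = node.attrib
print('terminal' in attrib)        # False, so rec() treats it as a non-terminal
print(attrib['child'].split(' '))  # ['sp1', 'sp2'], the keys looked up in spans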
def text_to_instance(self,
                     sentence: str,
                     tags: List[str] = None,
                     deps: List[int] = None,
                     weight: float = 1.0) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    tokens = [Token(utils.normalize(token)) for token in sentence.split(' ')]
    token_field = TextField(tokens, self._token_indexers)
    metadata = MetadataField({'words': sentence})
    weight = ArrayField(numpy.array([weight], 'f'))
    fields = {
        'words': token_field,
        'metadata': metadata,
        'weight': weight,
    }
    if tags is not None and deps is not None:
        # Gold supervision is optional: attach tag and head-index fields
        # only when both sequences are provided.
        fields['head_tags'] = SequenceLabelField(
            tags, token_field, label_namespace='head_tags')
        fields['head_indices'] = SequenceLabelField(
            deps, token_field, label_namespace='head_indices')
    return Instance(fields)
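A hedged usage sketch, assuming this method lives on an AllenNLP DatasetReader whose self._token_indexers is already configured; the reader variable and the tag/dependency values below are illustrative, not taken from the source.

# Hypothetical call on an already-constructed reader instance.
instance = reader.text_to_instance(
    'John hit the ball',
    tags=['N', 'S\\N', 'N/N', 'N'],
    deps=[2, 0, 4, 2],
    weight=0.5)
print(list(instance.fields))
# ['words', 'metadata', 'weight', 'head_tags', 'head_indices']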
try:
    lines = open(file)
except IOError as e:
    die(f'could not open gold_deps file ({e.strerror})')
deps, udeps = set(), set()
for line in lines:
    line = line.strip()
    if line.startswith('<s>'):
        # Sentence delimiter: emit the dependencies collected so far
        # and start fresh sets for the next sentence.
        yield deps, udeps
        deps, udeps = set(), set()
        continue
    arg_index, pred_index, cat, slot, arg, pred = line.split()[:6]
    pred = f'{utils.normalize(pred)}_{int(pred_index) + 1}'
    arg = f'{utils.normalize(arg)}_{int(arg_index) + 1}'
    deps.add((pred, cat, slot, arg))
    udeps.add((pred, arg))
assert len(deps) == 0 and len(udeps) == 0
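A minimal sketch of how such a generator might be consumed, assuming the block above is wrapped in a function; the name read_gold_deps and the file name are hypothetical.

# Each yielded pair holds the labelled and unlabelled dependency sets for one
# sentence, delimited in the gold file by lines starting with '<s>'.
for deps, udeps in read_gold_deps('gold.deps'):
    print(len(deps), 'labelled deps,', len(udeps), 'unlabelled deps')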
elif not sys.stdin.isatty():
    input_type = sys.stdin
else:
    # reading from keyboard
    input_type = None
    sys.stdout.flush()
    sys.stderr.flush()
    logging.getLogger().setLevel(logging.CRITICAL)

while True:
    # Collect non-empty input lines, either one at a time from the keyboard
    # or all at once from the piped stdin / file object.
    fin = [line for line in ([input()] if input_type is None else input_type)
           if len(line.strip()) > 0]
    if len(fin) == 0:
        break
    if args.input_format == 'POSandNERtagged':
        tagged_doc = [[Token.from_piped(token) for token in sent.strip().split(' ')]
                      for sent in fin]
        doc = [' '.join(token.word for token in sent) for sent in tagged_doc]
        res = parser.parse_doc(doc,
                               probs=probs,
                               tag_list=tag_list,
                               batchsize=args.batchsize)
    elif args.input_format == 'json':
        doc = [json.loads(line) for line in fin]
        tagged_doc = annotate_fun(
            [[word for word in sent['words'].split(' ')] for sent in doc])
        res = parser.parse_json(doc)
    elif args.input_format == 'partial':
        doc, constraints = zip(*[read_partial_tree(l.strip()) for l in fin])
        tagged_doc = annotate_fun(doc)
        res = parser.parse_doc(doc,
                               probs=probs,
                               tag_list=tag_list,
                               batchsize=args.batchsize)
@classmethod
def from_piped(cls, string: str) -> 'Token':
    # WORD|POS|NER or WORD|LEMMA|POS|NER
    # or WORD|LEMMA|POS|NER|CHUNK
    items = string.split('|')
    if len(items) == 5:
        word, lemma, pos, entity, chunk = items
    elif len(items) == 4:
        word, lemma, pos, entity = items
        chunk = 'XX'
    else:
        assert len(items) == 3
        word, pos, entity = items
        # 'XX' marks fields that are absent from the piped input.
        lemma = 'XX'
        chunk = 'XX'
    return Token(word=word,
                 lemma=lemma,
                 pos=pos,
                 entity=entity,
                 chunk=chunk)
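A short usage sketch of the piped format documented in the comment above; the token strings are illustrative.

# Full form: WORD|LEMMA|POS|NER|CHUNK
token = Token.from_piped('dogs|dog|NNS|O|I-NP')
# Short form: WORD|POS|NER, with lemma and chunk defaulting to 'XX'
token = Token.from_piped('dogs|NNS|O')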
try:
    from janome.tokenizer import Tokenizer
except ImportError:
    logger.error('failed to import janome. please install it with "pip install janome".')
    exit(1)

logger.info('using Janome to tokenize and annotate POS info.')
tokenizer = Tokenizer()
res = []
raw_sentences = []
for sentence in sentences:
    sentence = ''.join(sentence)
    tokenized = tokenizer.tokenize(sentence)
    tokens = []
    for token in tokenized:
        # Janome's part_of_speech is a comma-separated string of four fields.
        pos, pos1, pos2, pos3 = token.part_of_speech.split(',')
        token = Token(word=token.surface,
                      surf=token.surface,
                      pos=pos,
                      pos1=pos1,
                      pos2=pos2,
                      pos3=pos3,
                      inflectionForm=token.infl_form,
                      inflectionType=token.infl_type,
                      reading=token.reading,
                      base=token.base_form)
        tokens.append(token)
    raw_sentence = [token.surface for token in tokenized]
    res.append(tokens)
    raw_sentences.append(raw_sentence)
return res, raw_sentences
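A hedged usage sketch, assuming janome is installed and the block above is wrapped in a function taking a list of pre-segmented sentences; the wrapper name annotate_using_janome is hypothetical.

# Each input sentence is an iterable of strings that gets joined before tokenization.
res, raw_sentences = annotate_using_janome([['猫', 'が', '好き', 'です']])
print(raw_sentences[0])  # surface forms produced by Janome
print(res[0][0].pos)     # coarse POS tag of the first token, e.g. '名詞'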
res, error = proc.communicate()
try:
    # Each C&C output token has the form WORD|POS|NER.
    tagged_sentences = res.decode('utf-8').strip().split('\n')
    tagged_sentences = [[tuple(token.split('|')) for token in sentence.strip().split(' ')]
                        for sentence in tagged_sentences]
except Exception:
    raise RuntimeError('failed to process C&C output. there may have been a problem '
                       'while running the C&C pipeline.\n'
                       f'stderr:\n {error}')
res = []
for sentence in tagged_sentences:
    words, poss = zip(*[(word, pos) for word, pos, _ in sentence])
    lemmas = stemmer.analyze(list(words), list(poss))
    tokens = [Token(word=word, pos=pos, entity=ner, lemma=lemma.lower(), chunk='XX')
              for (word, pos, ner), lemma in zip(sentence, lemmas)]
    res.append(tokens)
return res
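A minimal, self-contained sketch of the token splitting applied to the C&C pipe output; the sample line is illustrative.

# Each token in the C&C output is WORD|POS|NER.
sentence = 'Pierre|NNP|I-PER Vinken|NNP|I-PER joined|VBD|O the|DT|O board|NN|O'
tagged = [tuple(token.split('|')) for token in sentence.strip().split(' ')]
words, poss = zip(*[(word, pos) for word, pos, _ in tagged])
print(words)  # ('Pierre', 'Vinken', 'joined', 'the', 'board')
print(poss)   # ('NNP', 'NNP', 'VBD', 'DT', 'NN')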