def test_parse_incr_invalid_file(self):
    with self.assertRaises(FileNotFoundError):
        list(parse_incr("SOME STRING DATA"))
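conllu's parse_incr needs an opened file-like object and raises FileNotFoundError when handed a plain string, which is exactly what this test asserts. For in-memory data, use parse() or wrap the string in io.StringIO, as in this small sketch:

import io
from conllu import parse, parse_incr

data = "1\tThe\tthe\tDET\t_\t_\t2\tdet\t_\t_\n2\tdog\tdog\tNOUN\t_\t_\t0\troot\t_\t_\n\n"

# parse() accepts an in-memory string directly ...
sentences = parse(data)

# ... while parse_incr() wants a file-like object and streams sentences lazily.
for tokenlist in parse_incr(io.StringIO(data)):
    print(tokenlist[0]["form"])  # "The"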
def convert(input, output):
    print("Parsing file '{}'.".format(input))
    if os.path.exists(output):
        os.remove(output)
    data_file = open(input, "r", encoding="utf-8")
    # Start the output file with the Rasa NLU markdown intent header.
    f = open(output, "a")
    f.write("## intent:ner_examples")
    f.write("\n")
    f.close()
    for tokenlist in parse_incr(data_file, fields=CONLL_FILEDS):
        tokens = []
        entity = None
        found_entity = False
        for token in tokenlist:
            if "entity" not in token:
                token["entity"] = "O"
            # new entity found
            if token["entity"].startswith("B-") and not found_entity:
                tokens.append("[{}".format(token["form"]))
                found_entity = True
                entity = token["entity"][2:]
            # new entity directly after another entity
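The converter above is cut off mid-loop. A hypothetical sketch of how such a BIO-to-Rasa-markdown loop can be completed (the helper name bio_tokens_to_markdown and the exact branch handling are assumptions, not the original code):

def bio_tokens_to_markdown(tokenlist):
    # Turn BIO-tagged tokens into a Rasa-markdown line like "- [New York](city) is big".
    tokens, entity, found_entity = [], None, False
    for token in tokenlist:
        tag = token.get("entity", "O")
        if tag.startswith("B-"):
            if found_entity:  # new entity directly after another entity: close the old one
                tokens[-1] += "]({})".format(entity)
            tokens.append("[{}".format(token["form"]))
            entity, found_entity = tag[2:], True
        elif tag.startswith("I-") and found_entity:
            tokens.append(token["form"])
        else:
            if found_entity:  # entity ended on the previous token
                tokens[-1] += "]({})".format(entity)
                found_entity = False
            tokens.append(token["form"])
    if found_entity:  # sentence ended inside an entity
        tokens[-1] += "]({})".format(entity)
    return "- " + " ".join(tokens)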
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as conllu_file:
        logger.info("Reading UD instances from conllu dataset at: %s", file_path)

        for annotation in parse_incr(conllu_file):
            # CoNLLU annotations sometimes add back in words that have been elided
            # in the original sentence; we remove these, as we're just predicting
            # dependencies for the original sentence. Elided and multi-word lines
            # get non-integer ids from the conllu python library, so keeping only
            # integer ids filters them out.
            annotation = [x for x in annotation if isinstance(x["id"], int)]
            heads = [x["head"] for x in annotation]
            tags = [x["deprel"] for x in annotation]
            words = [x["form"] for x in annotation]
            if self.use_language_specific_pos:
                pos_tags = [x["xpostag"] for x in annotation]
            else:
                pos_tags = [x["upostag"] for x in annotation]
            yield self.text_to_instance(words, pos_tags, list(zip(tags, heads)))
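For reference, each annotation yielded by parse_incr is a TokenList of dict-like tokens keyed by the CoNLL-U columns; the keys used here ("form", "head", "deprel", "upostag"/"xpostag") match the conllu versions these snippets target. A minimal demonstration:

import io
from conllu import parse_incr

conll = (
    "1\tDogs\tdog\tNOUN\tNNS\t_\t2\tnsubj\t_\t_\n"
    "2\tbark\tbark\tVERB\tVBP\t_\t0\troot\t_\t_\n\n"
)
for annotation in parse_incr(io.StringIO(conll)):
    print([x["form"] for x in annotation])    # ['Dogs', 'bark']
    print([x["head"] for x in annotation])    # [2, 0]  (heads are parsed to int)
    print([x["deprel"] for x in annotation])  # ['nsubj', 'root']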
def parse(self, file):
    data = []
    file = io.TextIOWrapper(file, encoding='utf-8')

    # TODO: add exception handling for malformed input lines.
    field_parsers = {
        "ne": lambda line, i: conllu.parser.parse_nullable_value(line[i]),
    }
    gen_parser = conllu.parse_incr(
        file,
        fields=("form", "ne"),
        field_parsers=field_parsers
    )
    try:
        for sentence in gen_parser:
            if not sentence:
                continue
            if len(data) >= settings.IMPORT_BATCH_SIZE:
                yield data
                data = []
            words, labels = [], []
            for item in sentence:
                word = item.get("form")
                tag = item.get("ne")
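The fields/field_parsers arguments above retarget conllu at a two-column word/NE format instead of the ten standard CoNLL-U columns. A self-contained sketch of the same idea (the sample data is made up):

import io
import conllu

two_col = "Paris\tB-LOC\nis\tO\nnice\tO\n\n"
field_parsers = {
    "ne": lambda line, i: conllu.parser.parse_nullable_value(line[i]),  # "_" becomes None
}
for sentence in conllu.parse_incr(io.StringIO(two_col),
                                  fields=("form", "ne"),
                                  field_parsers=field_parsers):
    print([(tok["form"], tok["ne"]) for tok in sentence])
# [('Paris', 'B-LOC'), ('is', 'O'), ('nice', 'O')]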
words = Layer(name='words',
              text_object=text,
              ambiguous=False
              )
syntax = Layer(name=syntax_layer,
               text_object=text,
               attributes=['id', 'lemma', 'upostag', 'xpostag', 'feats', 'head', 'deprel', 'deps', 'misc'],
               ambiguous=False
               )
cur = 0
t = []
sentence_start = 0
with open(file, "r", encoding="utf-8") as data_file:
    for sentence in parse_incr(data_file):
        for w in sentence:
            token = w['form']
            t.append(token)
            len_w = len(token)
            base_span = ElementaryBaseSpan(cur, cur + len_w)
            words.add_annotation(base_span)
            syntax.add_annotation(base_span, **w)
            cur += len_w + 1
        sentences.add_annotation(words[sentence_start:])
        sentence_start += len(sentence)
text.text = ' '.join(t)
text.add_layer(words)
text.add_layer(sentences)
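The offset bookkeeping above relies on the tokens later being joined with single spaces, so each token occupies [cur, cur + len(token)) in the final text and cur then skips the separating space. A plain-Python check of that arithmetic (no EstNLTK required):

tokens = ["See", "on", "lause", "."]
cur, spans = 0, []
for tok in tokens:
    spans.append((cur, cur + len(tok)))
    cur += len(tok) + 1  # +1 for the joining space
text = " ".join(tokens)
assert all(text[a:b] == tok for (a, b), tok in zip(spans, tokens))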
def _split_into_fname_and_counter(sent_id_str):
    '''Splits the given sent_id string at its last underscore into
       a file name part and a sentence counter part.
       Returns results in a tuple.
    '''
    j = -1
    for i in range(len(sent_id_str) - 1, -1, -1):
        if sent_id_str[i] == '_':
            j = i
            break
    return (sent_id_str[:j], sent_id_str[j:]) if j != -1 else (sent_id_str, '')
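Worked examples: the split happens at the last underscore, and the counter part keeps its leading underscore.

assert _split_into_fname_and_counter("aja_ee_1999_42") == ("aja_ee_1999", "_42")
assert _split_into_fname_and_counter("nofilename") == ("nofilename", "")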
cur = 0
t = []
sentence_start = 0
last_fname = None
last_sent_id = '##start##'
with open(file, "r", encoding="utf-8") as data_file:
    for sentence in parse_incr(data_file):
        cur_sent_id = sentence.metadata.get('sent_id', None)
        if last_sent_id != '##start##':
            # Determine if we need to create a new document
            if isinstance(last_sent_id, str) and isinstance(cur_sent_id, str):
                # Separate fname from the sentence counter
                last_fname, _ = _split_into_fname_and_counter(last_sent_id)
                cur_fname, _ = _split_into_fname_and_counter(cur_sent_id)
                if postcorrect_sent_ids:
                    # Manually correct some broken file names
                    # (remove redundant letter 'n' from the start)
                    if last_fname in broken_fnames:
                        last_fname = last_fname[1:]
                    if cur_fname in broken_fnames:
                        cur_fname = cur_fname[1:]
                if last_fname != cur_fname:
                    # New document needs to be created
def _read_one_file(self, lang: str, file_path: str):
    with open(file_path, "r") as conllu_file:
        logger.info(
            "Reading UD instances for %s language from conllu dataset at: %s", lang, file_path
        )

        for annotation in parse_incr(conllu_file):
            # CoNLLU annotations sometimes add back in words that have been elided
            # in the original sentence; we remove these, as we're just predicting
            # dependencies for the original sentence.
            # We filter by None here as elided words have a non-integer word id,
            # and are replaced with None by the conllu python library.
            annotation = [x for x in annotation if x["id"] is not None]
            heads = [x["head"] for x in annotation]
            tags = [x["deprel"] for x in annotation]
            words = [x["form"] for x in annotation]
            if self._use_language_specific_pos:
                pos_tags = [x["xpostag"] for x in annotation]
            else:
                pos_tags = [x["upostag"] for x in annotation]
            yield self.text_to_instance(lang, words, pos_tags, list(zip(tags, heads)))
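The two readers filter elided and multi-word lines differently because the conllu library changed its id parsing over time: older versions returned None for non-integer ids, while recent versions parse a range id such as "1-2" into a tuple like (1, '-', 2). Either way, keeping only integer ids leaves just the basic word lines:

import io
from conllu import parse_incr

conll = (
    "1-2\tvámonos\t_\t_\t_\t_\t_\t_\t_\t_\n"
    "1\tvamos\tir\tVERB\t_\t_\t0\troot\t_\t_\n"
    "2\tnos\tnosotros\tPRON\t_\t_\t1\tobj\t_\t_\n\n"
)
for sentence in parse_incr(io.StringIO(conll)):
    kept = [tok for tok in sentence if isinstance(tok["id"], int)]
    print([tok["form"] for tok in kept])  # ['vamos', 'nos'] - the "1-2" range line is dropped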
if word_to_id_dict is None:
    word_to_id = defaultdict(lambda: len(word_to_id))
else:
    word_to_id = word_to_id_dict
text = []
tags = []
trees = []
heads = []
right_num_deps = []
left_num_deps = []
deps = []
# Open the same file twice: one handle for the flat token stream,
# one for the tree stream, consumed in parallel below.
fin = open(fname, "r", encoding="utf-8")
fin_tree = open(fname, "r", encoding="utf-8")
data_file_tree = parse_tree_incr(fin_tree)
data_file = parse_incr(fin)
for sent, tree in zip(data_file, data_file_tree):
    sent_list = []
    tag_list = []
    head_list = []
    right_num_deps_ = []
    left_num_deps_ = []
    sent_n = []
    deps_list = []
    # drop multi-word tokens (their ids are not plain integers)
    for token in sent:
        if isinstance(token["id"], int):
            sent_n += [token]
    for token in sent_n:
        sent_list.append(word_to_id[token["form"]])
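parse_tree_incr streams each sentence as a TokenTree (a root token plus nested children), which the loop above consumes in lockstep with the flat parse_incr view of the same file. A minimal traversal:

import io
from conllu import parse_tree_incr

conll = (
    "1\tDogs\tdog\tNOUN\t_\t_\t2\tnsubj\t_\t_\n"
    "2\tbark\tbark\tVERB\t_\t_\t0\troot\t_\t_\n\n"
)

def walk(node, depth=0):
    # Each TokenTree node exposes .token (a dict of columns) and .children (a list).
    print("  " * depth + node.token["form"])
    for child in node.children:
        walk(child, depth + 1)

for tree in parse_tree_incr(io.StringIO(conll)):
    walk(tree)  # prints "bark" then its dependent "Dogs"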