# Imports assumed for this snippet: dedent from the standard library,
# parse and the value parsers from the conllu package.
from textwrap import dedent

from conllu import parse
from conllu.parser import parse_dict_value, parse_int_value

def test_parse_CoNLL2009_1(self):
    data = dedent("""\
        #\tid\tform\tlemma\tplemma\tpos\tppos\tfeats\tpfeats\thead\tphead\tdeprel\tpdeprel\tfillpred\tpred\tapreds
        1\tZ\tz\tz\tR\tR\tSubPOS=R|Cas=2\tSubPOS=R|Cas=2\t10\t10\tAuxP\tAuxP\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_
        2\ttéto\ttento\ttento\tP\tP\tSubPOS=D|Gen=F|Num=S|Cas=2\tSubPOS=D|Gen=F|Num=S|Cas=2\t3\t3\tAtr\tAtr\tY\ttento\t_\tRSTR\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_
        3\tknihy\tkniha\tkniha\tN\tN\tSubPOS=N|Gen=F|Num=S|Cas=2|Neg=A\tSubPOS=N|Gen=F|Num=S|Cas=2|Neg=A\t1\t1\tAdv\tAdv\tY\tkniha\t_\t_\t_\t_\t_\t_\t_\tDIR1\t_\t_\t_\t_\t_\t_\t_\t_
    """)
    sentences = parse(
        data,
        fields=(
            'id', 'form', 'lemma', 'plemma', 'pos', 'ppos', 'feats', 'pfeats',
            'head', 'phead', 'deprel', 'pdeprel', 'fillpred', 'pred', 'apreds'
        ),
        field_parsers={
            "pfeats": lambda line, i: parse_dict_value(line[i]),
            "phead": lambda line, i: parse_int_value(line[i]),
            # everything from the "apreds" column to the end of the line,
            # with "_" mapped to None
            "apreds": lambda line, i: [
                apred_field if apred_field != "_" else None
                for apred_field in line[i:len(line)]
            ],
        },
    )
    self.assertEqual(
        sentences[0][2],
        ...,  # expected token omitted; the original snippet is truncated here
    )
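For contrast with the custom CoNLL-2009 layout above, here is a minimal sketch of parse with its default ten CoNLL-U fields. The two-token sentence is made up for illustration, and the "upostag" key matches the older conllu releases these snippets target (newer releases use "upos").

from conllu import parse

# A two-token CoNLL-U sentence (made-up example data).
data = (
    "1\tThe\tthe\tDET\t_\t_\t2\tdet\t_\t_\n"
    "2\tcat\tcat\tNOUN\t_\t_\t0\troot\t_\t_\n"
    "\n"
)

sentences = parse(data)   # list of TokenList objects, one per sentence
token = sentences[0][0]   # first token of the first sentence
print(token["form"], token["upostag"], token["head"])  # The DET 2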
# Imports assumed for this snippet.
import conllu
import pandas as pd
from tqdm import tqdm

def read_lemmatization_data(path):
    with open(path) as f:
        df = pd.DataFrame(tok for sent in tqdm(conllu.parse(f.read())) for tok in sent)
    # Features: (word class, full form) pairs; targets: the lemmas.
    X = [(word_class, full_form) for _, (word_class, full_form) in df[["upostag", "form"]].iterrows()]
    y = [lemma for _, (lemma,) in df[["lemma"]].iterrows()]
    return X, y
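A usage sketch (the file path is hypothetical):

X, y = read_lemmatization_data("train.conllu")  # hypothetical path
print(len(X), len(y))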
# Imports assumed for this snippet; conll17_ud_eval is the CoNLL 2017
# shared task evaluation script, format_as_conllu a project-local helper.
import importlib
from io import StringIO

import conllu
import pandas as pd
import conll17_ud_eval

def benchmark_model(model_name, test_data_path, ner_test_data):
    with open(test_data_path) as f:
        data = conllu.parse(f.read())
    text = " ".join(d.metadata["text"] for d in data)
    # Import the model package by name and call its load() entry point.
    load_model = getattr(importlib.import_module(model_name), "load")
    nlp = load_model()
    # Re-serialize the model output and score it against the gold file.
    _parsed = StringIO(format_as_conllu(nlp(text), 1))
    parsed = conll17_ud_eval.load_conllu(_parsed)
    gold = conll17_ud_eval.load_conllu_file(test_data_path)
    results = pd.DataFrame(
        {k: v.__dict__ for k, v in conll17_ud_eval.evaluate(gold, parsed).items()}
    ).T
    print(results)
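Because each Score object's __dict__ is expanded into columns and the frame is transposed, individual metrics can be read straight off results; a sketch, assuming the scorer's usual metric names and f1 attribute:

# Hypothetical follow-up inside benchmark_model: pick out attachment scores.
las_f1 = results.loc["LAS", "f1"]   # labeled attachment F1
uas_f1 = results.loc["UAS", "f1"]   # unlabeled attachment F1
print("LAS={:.2%} UAS={:.2%}".format(las_f1, uas_f1))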
diterator = DataIterator()  # DataIterator is a project-local class not shown in this snippet
# glob and conllu are assumed imported; sentence_repr and parse_szk
# are project-local helpers not shown in this snippet.
def convert_szk_to_conllu(from_glob, to_path, dev_path, test_path):
    # Collect the dev/test sentences so they can be excluded from the output.
    ignored = []
    for fpath in [dev_path, test_path]:
        with open(fpath) as f:
            ignored.extend(map(sentence_repr, conllu.parse(f.read())))
    ignored = set(ignored)
    parsed = []
    for fpath in glob.glob(from_glob):
        for sent in conllu.parse("\n\n".join(parse_szk(fpath))):
            if sentence_repr(sent) not in ignored:
                parsed.append(sent)
    print(len(parsed))
    with open(to_path, "w") as outf:
        out = "".join(sent.serialize() for sent in parsed)
        outf.write(out)
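sentence_repr itself is not shown in these snippets; a minimal sketch of what such a deduplication key might look like, assuming a sentence only needs to be identified by its surface forms (the implementation is hypothetical):

def sentence_repr(sent):
    # Hypothetical helper: key a sentence by its space-joined word forms.
    return " ".join(tok["form"] for tok in sent)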
# Variant with a `morph` switch choosing between the morphology and the
# dependency reader; logging is assumed imported.
def convert_szk_to_conllu(from_glob, to_path, dev_path, test_path, morph):
    ignored = []
    for fpath in [dev_path, test_path]:
        with open(fpath) as f:
            ignored.extend(map(sentence_repr, conllu.parse(f.read())))
    parser = parse_szk_morph if morph else parse_szk_dep
    ignored = set(ignored)
    parsed = []
    for fpath in glob.glob(from_glob):
        for sent in conllu.parse("\n\n".join(parser(fpath))):
            if sentence_repr(sent) not in ignored:
                parsed.append(sent)
    logging.info("Read {} sentences".format(len(parsed)))
    with open(to_path, "w") as outf:
        out = "".join(sent.serialize() for sent in parsed)
        outf.write(out)
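A usage sketch (all paths are hypothetical):

convert_szk_to_conllu(
    from_glob="szeged/*.txt",   # hypothetical input glob
    to_path="train.conllu",
    dev_path="dev.conllu",
    test_path="test.conllu",
    morph=True,                 # use the morphology reader
)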
# os, sys, conllu and the folia library are assumed imported; UPOS_SET and
# UDEP_SET are constants, and the --id and --lemma-set options are defined
# earlier in the script (the snippet starts mid-file).
parser.add_argument('--pos-set', dest="posset", type=str,
                    help="URL of the set definition for *language-specific* part-of-speech tags and features (xpos, not the universal POS!)",
                    action='store', default="undefined", required=False)
parser.add_argument('--dependency-set', dest="depset", type=str, help="Dependency set",
                    action='store', default=UDEP_SET, required=False)
parser.add_argument('-o', '--outputdir', type=str, help="Output directory",
                    action='store', default=".", required=False)
parser.add_argument('files', nargs='+', help='CoNLL-U input files')
args = parser.parse_args()

for file in args.files:
    if args.id:
        doc_id = args.id
    else:
        doc_id = os.path.basename(file)
    doc = None
    hascontent = False
    with open(file, 'r', encoding='utf-8') as f:
        sentences = conllu.parse(f.read())
    for i, tokenlist in enumerate(sentences):
        # Start a new FoLiA document at an explicit "newdoc id" marker
        # (or for the very first sentence).
        if 'newdoc id' in tokenlist.metadata or i == 0:
            if doc is not None and hascontent:
                doc.save(os.path.join(args.outputdir, doc_id + ".folia.xml"))
                print("Wrote " + doc_id + ".folia.xml", file=sys.stderr)
            if 'newdoc id' in tokenlist.metadata:
                doc_id = tokenlist.metadata['newdoc id']
            hascontent = False
            doc = folia.Document(id=doc_id)
            doc.declare(folia.PosAnnotation, set=UPOS_SET, annotator="conll2folia")
            doc.declare(folia.PosAnnotation, set=args.posset, annotator="conll2folia")
            doc.declare(folia.Dependency, set=args.depset, annotator="conll2folia")
            doc.declare(folia.LemmaAnnotation, set=args.lemmaset, annotator="conll2folia")
            textbody = folia.Text(doc, id=doc_id + '.text')
            doc.append(textbody)
            anchor = textbody