from spacy.gold import GoldParse
from spacy.tokens import Doc
from spacy.util import fix_random_seed
from thinc.neural.ops import NumpyOps
from thinc.neural.optimizers import Adam


def _train_parser(parser):
    # Train a freshly created parser pipe on one tiny fixed example (spaCy v2 API).
    fix_random_seed(1)
    parser.add_label("left")
    parser.begin_training([], **parser.cfg)
    sgd = Adam(NumpyOps(), 0.001)
    for i in range(5):
        losses = {}
        doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
        gold = GoldParse(doc, heads=[1, 1, 3, 3],
                         deps=["left", "ROOT", "left", "ROOT"])
        parser.update([doc], [gold], sgd=sgd, losses=losses)
    return parser
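# A minimal usage sketch, assuming the spaCy v2 pipeline API; the blank "en"
# pipeline and the sample sentence are illustrative, not from the original:
import spacy

nlp = spacy.blank("en")
parser = nlp.create_pipe("parser")
parser = _train_parser(parser)
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
parser(doc)
print([(t.text, t.dep_, t.head.i) for t in doc])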
def get_golds(self, model, force=False):
    # Build GoldParse objects from the stored (text, annotation) pairs,
    # caching the result unless force=True.
    if len(self.golds) > 0 and not force:
        return self.golds
    self.golds = []
    for text, annotation in self.label_data:
        doc = model.tokenizer(text)
        gold = GoldParse(doc, entities=annotation["entities"])
        self.golds.append(gold)
    return self.golds
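# A hedged sketch of the self.label_data shape this method expects:
# (text, annotation) pairs with character-offset entities (values invented):
label_data = [
    ("Acme Corp hired Jane Doe.",
     {"entities": [(0, 9, "ORG"), (16, 24, "PERSON")]}),
]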
from typing import List

from spacy.gold import GoldParse
from spacy.tokens import Doc


def convert_unknown_bilou(doc: Doc, offsets: List[Offset]) -> GoldParse:
    """
    Convert entity offsets to a list of BILOU annotations,
    mapping the UNKNOWN label to spaCy missing values.
    https://spacy.io/api/goldparse#biluo_tags_from_offsets
    :param doc: spaCy tokenized text
    :param offsets: discovered offsets
    :return: GoldParse built from the BILOU annotations
    """
    tuple_offsets = [offset.to_tuple() for offset in offsets]
    bilou_annotations = convert_bilou_with_missing_action(doc=doc, offsets=tuple_offsets)
    return GoldParse(doc, entities=bilou_annotations)
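# Offset and convert_bilou_with_missing_action are project-local helpers not
# shown in the snippet; a hypothetical Offset compatible with the call above:
from dataclasses import dataclass


@dataclass
class Offset:
    start: int
    end: int
    type: str

    def to_tuple(self):
        return self.start, self.end, self.type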
if "-" in id_:
continue
id_ = int(id_) - 1
head = int(head) - 1 if head != "0" else id_
sent["words"].append(word)
sent["tags"].append(tag)
sent["morphology"].append(_parse_morph_string(morph))
sent["morphology"][-1].add("POS_%s" % pos)
sent["heads"].append(head)
sent["deps"].append("ROOT" if dep == "root" else dep)
sent["spaces"].append(space_after == "_")
sent["entities"] = ["-"] * len(sent["words"])
sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
if oracle_segments:
docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
golds.append(GoldParse(docs[-1], **sent))
assert golds[-1].morphology is not None
sent_annots.append(sent)
if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
doc, gold = _make_gold(nlp, None, sent_annots)
assert gold.morphology is not None
sent_annots = []
docs.append(doc)
golds.append(gold)
if limit and len(docs) >= limit:
return docs, golds
if raw_text and sent_annots:
doc, gold = _make_gold(nlp, None, sent_annots)
docs.append(doc)
golds.append(gold)
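# For illustration, each per-sentence dict built above ends up shaped roughly
# like this (values invented; "morphology" additionally holds the feature
# sets produced by _parse_morph_string):
sent_example = {
    "words": ["She", "sleeps"],
    "tags": ["PRP", "VBZ"],
    "heads": [1, 1],
    "deps": ["nsubj", "ROOT"],
    "spaces": [True, False],
    "entities": ["-", "-"],
}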
# The def line below is a hypothetical reconstruction (the original snippet
# starts mid-function); lxml is required for html.fromstring.
from lxml import html


def read_opencorpora(nlp, opencorpora_file):
    parsed_sentences = []
    gold_sentences = []
    with open(opencorpora_file, "r") as f:
        opencorpora = f.read().encode('utf-8')
    page_tree = html.fromstring(opencorpora)
    for text in page_tree.xpath('//text'):
        for paragraphs in text.xpath('./paragraphs'):
            for paragraph in paragraphs.xpath('./paragraph'):
                for sentence in paragraph.xpath('./sentence'):
                    sent_text = sentence.xpath('./source')[0].text
                    parsed_sentences.append(nlp(sent_text))
                    sent_words = [token.attrib['text']
                                  for token in sentence.xpath('./tokens/token')]
                    gold = GoldParse(Doc(nlp.vocab, words=sent_words),
                                     words=sent_words,
                                     # heads=sent_heads, tags=sent_tags, deps=sent_deps,
                                     entities=['-'] * len(sent_words))
                    gold_sentences.append(gold)
    return parsed_sentences, gold_sentences
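# For orientation, the OpenCorpora XML walked by the xpath calls above looks
# roughly like this (structure inferred from the queries; attributes abbreviated):
# <text>
#   <paragraphs>
#     <paragraph>
#       <sentence>
#         <source>sentence text</source>
#         <tokens><token text="..."/> ...</tokens>
#       </sentence>
#     </paragraph>
#   </paragraphs>
# </text>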
# Fragment of read_data() from an earlier revision of spaCy's
# examples/training/conllu.py; the loop headers are restored from that script.
for cs in cd:
    sent = defaultdict(list)
    for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
        if '.' in id_:
            continue
        if '-' in id_:
            continue
        id_ = int(id_) - 1
        head = int(head) - 1 if head != '0' else id_
        sent['words'].append(word)
        sent['tags'].append(tag)
        sent['heads'].append(head)
        sent['deps'].append('ROOT' if dep == 'root' else dep)
        sent['spaces'].append(space_after == '_')
    sent['entities'] = ['-'] * len(sent['words'])
    sent['heads'], sent['deps'] = projectivize(sent['heads'],
                                               sent['deps'])
    if oracle_segments:
        docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces']))
        golds.append(GoldParse(docs[-1], **sent))
    sent_annots.append(sent)
    if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
        doc, gold = _make_gold(nlp, None, sent_annots)
        sent_annots = []
        docs.append(doc)
        golds.append(gold)
        if limit and len(docs) >= limit:
            return docs, golds
if raw_text and sent_annots:
    doc, gold = _make_gold(nlp, None, sent_annots)
    docs.append(doc)
    golds.append(gold)
if limit and len(docs) >= limit:
    return docs, golds
from spacy.gold import GoldParse
from spacy.scorer import Scorer


def evaluate(nlp, gold_tuples, gold_preproc=True):
    # Score tagger and parser output against gold annotations
    # (spaCy v1-era API: tokens_from_list, callable tagger/parser).
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        for annot_tuples, brackets in sents:
            tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
            nlp.tagger(tokens)
            nlp.parser(tokens)
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold)
    return scorer
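# A hedged usage sketch: after scoring, the Scorer exposes aggregate
# attributes such as uas and las (gold_tuples shaped as consumed above):
scorer = evaluate(nlp, gold_tuples)
print(scorer.uas, scorer.las)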
# _make_gold from spaCy's examples/training/conllu.py; the def line and the
# `flat` initialisation are restored from that script.
def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
    # Flatten the conll annotations, and adjust the head indices
    flat = defaultdict(list)
    sent_starts = []
    for sent in sent_annots:
        flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"])
        for field in ["words", "tags", "deps", "morphology", "entities", "spaces"]:
            flat[field].extend(sent[field])
        sent_starts.append(True)
        sent_starts.extend([False] * (len(sent["words"]) - 1))
    # Construct text if necessary
    assert len(flat["words"]) == len(flat["spaces"])
    if text is None:
        text = "".join(
            word + " " * space for word, space in zip(flat["words"], flat["spaces"])
        )
    doc = nlp.make_doc(text)
    flat.pop("spaces")
    gold = GoldParse(doc, **flat)
    gold.sent_starts = sent_starts
    # Optionally drop a random fraction of dependency annotations,
    # leaving them as missing values.
    for i in range(len(gold.heads)):
        if random.random() < drop_deps:
            gold.heads[i] = None
            gold.labels[i] = None
    return doc, gold
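# Hedged usage sketch: feed the sentence dicts collected by the CoNLL-U loop
# above; text=None makes the helper reconstruct the raw text, and the
# drop_deps value here is illustrative.
doc, gold = _make_gold(nlp, None, sent_annots, drop_deps=0.1)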
# An earlier revision of the same helper (no morphology field); the def line
# is restored to match the body.
def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
    # Flatten the conll annotations, and adjust the head indices
    flat = defaultdict(list)
    sent_starts = []
    for sent in sent_annots:
        flat['heads'].extend(len(flat['words'])+head for head in sent['heads'])
        for field in ['words', 'tags', 'deps', 'entities', 'spaces']:
            flat[field].extend(sent[field])
        sent_starts.append(True)
        sent_starts.extend([False] * (len(sent['words'])-1))
    # Construct text if necessary
    assert len(flat['words']) == len(flat['spaces'])
    if text is None:
        text = ''.join(word + ' ' * space
                       for word, space in zip(flat['words'], flat['spaces']))
    doc = nlp.make_doc(text)
    flat.pop('spaces')
    gold = GoldParse(doc, **flat)
    gold.sent_starts = sent_starts
    for i in range(len(gold.heads)):
        if random.random() < drop_deps:
            gold.heads[i] = None
            gold.labels[i] = None
    return doc, gold
# Fragment of a CoNLL-U evaluation reader. The original snippet starts
# mid-function, so the enclosing loop headers and the guard condition below
# are a hypothetical reconstruction.
for docid, doc_sentences in corpus:                      # hypothetical header
    for sentence_rows in doc_sentences:                  # hypothetical header
        sent_words, sent_tags, sent_heads, sent_deps = [], [], [], []
        for id_, word, tag, head, dep in sentence_rows:  # hypothetical header
            if '-' in id_:  # guard assumed, matching the similar readers above
                continue
            id_ = int(id_) - 1
            try:
                head = int(head) - 1 if head != '0' else id_
            except ValueError:
                head = id_
            sent_words.append(word)
            sent_tags.append(tag)
            sent_heads.append(head)
            sent_deps.append('ROOT' if dep == 'root' else dep)
        sent_heads, sent_deps = projectivize(sent_heads, sent_deps)
        # Normalise whitespace before parsing: handling stray whitespace is
        # not what spaCy is being evaluated on, so it should not be scored.
        text = re.sub(r'\s+', ' ', text).strip()
        parsed_sentences.append(nlp(text))
        gold = GoldParse(Doc(nlp.vocab, words=sent_words), words=sent_words,
                         heads=sent_heads, tags=sent_tags, deps=sent_deps,
                         entities=['-'] * len(sent_words))
        gold_sentences.append(gold)
        documents[docid].append(text)
        documents_gold_sentences[docid].append(gold)
        # 1 marks the first token of each sentence, 0 the rest.
        gold_segmentation[docid].append([1] + [0] * (len(sent_words) - 1))
return parsed_sentences, gold_sentences, gold_segmentation, documents, documents_gold_sentences
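# A hedged sketch of scoring the parsed sentences against their gold
# annotations with spaCy v2's Scorer, assuming the tokenizations align:
from spacy.scorer import Scorer

scorer = Scorer()
for doc, gold in zip(parsed_sentences, gold_sentences):
    scorer.score(doc, gold)
print(scorer.uas, scorer.las)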