def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    with make_tempdir() as tmpdir:
        out_file = tmpdir / "entity_ruler"
        # Write the patterns as plain JSONL (the old format) and check that
        # from_disk can still load them.
        srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
        new_ruler = EntityRuler(nlp).from_disk(out_file)
        for pattern in ruler.patterns:
            assert pattern in new_ruler.patterns
        assert len(new_ruler) == len(ruler)
        assert new_ruler.overwrite is not ruler.overwrite
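Every snippet on this page revolves around srsly.write_jsonl, which writes an iterable of JSON-serializable objects as one JSON object per line. As a point of reference, here is a minimal, self-contained round trip; the file name and example patterns are made up for illustration.

import tempfile
from pathlib import Path

import srsly

# Example patterns invented for illustration; any JSON-serializable objects work.
patterns = [
    {"label": "ORG", "pattern": "Acme"},
    {"label": "GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
]
with tempfile.TemporaryDirectory() as tmpdir:
    out_file = Path(tmpdir) / "patterns.jsonl"
    srsly.write_jsonl(out_file, patterns)  # one JSON object per line
    assert list(srsly.read_jsonl(out_file)) == patterns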
"patterns": lambda p: srsly.write_jsonl(
p.with_suffix(".jsonl"), self.patterns
),
nlp = spacy.load(spacy_model)
log(f"RECIPE: Loaded spaCy model '{spacy_model}'")
DB = connect()
if dataset not in DB:
    msg.fail(f"Can't find dataset '{dataset}'", exits=1)
examples = DB.get_dataset(dataset)
# Collect the terms that were accepted in the annotation dataset
terms = set([eg["word"] for eg in examples if eg["answer"] == "accept"])
# Turn each term into a token-based match pattern
if case_sensitive:
    patterns = [[{"text": t.text} for t in nlp.make_doc(term)] for term in terms]
else:
    terms = set([word.lower() for word in terms])
    patterns = [[{"lower": t.lower_} for t in nlp.make_doc(term)] for term in terms]
patterns = [{"label": label, "pattern": pattern} for pattern in patterns]
log(f"RECIPE: Generated {len(patterns)} patterns")
if not dry:
    srsly.write_jsonl(output_file, patterns)
return patterns
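For reference, a sketch of the kind of file the recipe above writes: each pattern dict becomes one line of JSONL. The label and terms below are invented for illustration.

import srsly

# Hypothetical patterns mirroring the recipe's output format.
patterns = [
    {"label": "FRUIT", "pattern": [{"lower": "apple"}]},
    {"label": "FRUIT", "pattern": [{"lower": "blood"}, {"lower": "orange"}]},
]
srsly.write_jsonl("fruit_patterns.jsonl", patterns)
# fruit_patterns.jsonl now contains lines such as:
# {"label":"FRUIT","pattern":[{"lower":"apple"}]}
# {"label":"FRUIT","pattern":[{"lower":"blood"},{"lower":"orange"}]}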
def save_training_examples(self):
    model_dir = Path(self.runner.config.model_dir)
    if not model_dir.is_dir():
        model_dir.mkdir(parents=True, exist_ok=True)
    # Write to local then copy (don't thrash virtual file systems like GCS)
    _, tmp_file = tempfile.mkstemp()
    srsly.write_jsonl(tmp_file, self.all_examples)
    out_file = model_dir / INPUT_EXAMPLES_FILE_NAME
    copyfile(tmp_file, str(out_file))
    os.remove(tmp_file)
    return str(out_file)
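The "write locally, then copy into place" idea above can be expressed as a small standalone helper. The function below is a sketch under that assumption; its name and signature are not part of the project shown here.

import os
import tempfile
from pathlib import Path
from shutil import copyfile

import srsly


def write_jsonl_via_tempfile(records, out_file):
    # Hypothetical helper: write JSONL to a local temp file, then copy it to its
    # destination in one go to avoid many small writes on remote/virtual file systems.
    out_file = Path(out_file)
    out_file.parent.mkdir(parents=True, exist_ok=True)
    fd, tmp_path = tempfile.mkstemp(suffix=".jsonl")
    os.close(fd)  # srsly opens the path itself, so release the descriptor
    srsly.write_jsonl(tmp_path, records)
    copyfile(tmp_path, str(out_file))
    os.remove(tmp_path)
    return str(out_file)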
    # Arguments of the converter call that produces `data` (the call head is
    # not part of this snippet)
    input_data,
    n_sents=n_sents,
    seg_sents=seg_sents,
    use_morphology=morphology,
    lang=lang,
    model=model,
    no_print=no_print,
)
if output_dir != "-":
    # Export data to a file
    suffix = ".{}".format(file_type)
    output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
    if file_type == "json":
        srsly.write_json(output_file, data)
    elif file_type == "jsonl":
        srsly.write_jsonl(output_file, data)
    elif file_type == "msg":
        srsly.write_msgpack(output_file, data)
    msg.good(
        "Generated output file ({} documents): {}".format(len(data), output_file)
    )
else:
    # Print to stdout
    if file_type == "json":
        srsly.write_json("-", data)
    elif file_type == "jsonl":
        srsly.write_jsonl("-", data)
    DOCS: https://spacy.io/api/entityruler#to_disk
    """
    path = ensure_path(path)
    cfg = {
        "overwrite": self.overwrite,
        "phrase_matcher_attr": self.phrase_matcher_attr,
        "ent_id_sep": self.ent_id_sep,
    }
    serializers = {
        "patterns": lambda p: srsly.write_jsonl(
            p.with_suffix(".jsonl"), self.patterns
        ),
        "cfg": lambda p: srsly.write_json(p, cfg),
    }
    if path.suffix == ".jsonl":  # user wants to save only JSONL
        srsly.write_jsonl(path, self.patterns)
    else:
        to_disk(path, serializers, {})
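A usage sketch for the two branches of to_disk above, using the spaCy v2-style EntityRuler API already shown on this page; the paths are arbitrary.

import spacy
from spacy.pipeline import EntityRuler

nlp = spacy.blank("en")
ruler = EntityRuler(nlp, patterns=[{"label": "ORG", "pattern": "Acme"}])
ruler.to_disk("patterns.jsonl")  # .jsonl suffix: only the patterns are written
ruler.to_disk("entity_ruler")    # directory: patterns.jsonl plus a cfg file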
            # (The enclosing loop over skills and the outer `if` branch are not
            # part of this snippet.)
            if pattern:
                label = f"SKILL|{skill_id}"
                patterns.append({"label": label, "pattern": pattern})
                for t in split_tokens:
                    if t in skill_name:
                        patterns.append(
                            {
                                "label": label,
                                "pattern": self._skill_pattern(skill_name, t),
                            }
                        )
        srsly.write_jsonl(patterns_path, patterns)
        return patterns
    else:
        patterns = srsly.read_jsonl(patterns_path)
        return patterns
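The if/else above amounts to a cache-or-build pattern around a JSONL file. A generic version of the same idea, with hypothetical names, might look like this:

from pathlib import Path

import srsly


def load_or_build_patterns(patterns_path, build_patterns):
    # Hypothetical helper mirroring the cache-or-build logic above.
    patterns_path = Path(patterns_path)
    if not patterns_path.exists():
        patterns = build_patterns()
        srsly.write_jsonl(patterns_path, patterns)
        return patterns
    # read_jsonl returns a generator; materialize it so callers can reuse it
    return list(srsly.read_jsonl(patterns_path))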