Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_roundtrip_docs_to_json():
    """Check that a doc survives a ``docs_to_json`` -> ``GoldCorpus`` round trip.

    A single-sentence doc with two category scores is serialized to a
    temporary JSON file, loaded back through ``GoldCorpus`` (the same file is
    used as both locations), and the reloaded text and category scores are
    compared with the originals.
    """
    text = "I flew to Silicon Valley via London."
    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
    nlp = English()
    doc = nlp(text)
    doc.cats = cats
    # The whole text is one sentence: only the first token opens a sentence.
    for position, token in enumerate(doc):
        token.is_sent_start = position == 0

    with make_tempdir() as tmpdir:
        json_file = tmpdir / "roundtrip.json"
        srsly.write_json(json_file, [docs_to_json(doc)])
        location = str(json_file)
        goldcorpus = GoldCorpus(location, location)
        reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))

        assert len(doc) == goldcorpus.count_train()
        assert text == reloaded_doc.text
        # Every category must be present after the round trip ...
        for label in ("TRAVEL", "BAKING"):
            assert label in goldparse.cats
        # ... and carry the score it was given before serialization.
        for label in ("TRAVEL", "BAKING"):
            assert cats[label] == goldparse.cats[label]
def _collate_best_model(meta, output_path, components):
    """Build ``output_path / "model-best"`` from the best run of each component.

    The directory starts as a copy of ``model-final``. For every entry in
    ``components`` the component sub-directory is then replaced with the one
    from the run returned by ``_find_best``, and that run's ``accuracy.json``
    supplies the values for the component's metrics in ``meta["accuracy"]``.
    The updated ``meta`` is written to ``model-best/meta.json``.

    meta (dict): Model meta; its "accuracy" entry is updated in place.
    output_path (Path): Directory holding the saved model directories.
    components (iterable): Names of the components to collate.
    RETURNS (Path): The path of the assembled "model-best" directory.
    """
    best_sources = {name: _find_best(output_path, name) for name in components}
    best_dest = output_path / "model-best"
    shutil.copytree(path2str(output_path / "model-final"), path2str(best_dest))
    for name, source_dir in best_sources.items():
        # Swap the component copied from model-final for its best version.
        component_dest = path2str(best_dest / name)
        shutil.rmtree(component_dest)
        shutil.copytree(path2str(source_dir / name), component_dest)
        # Carry over the scores that belong to this component.
        scores = srsly.read_json(source_dir / "accuracy.json")
        for metric in _get_metrics(name):
            meta["accuracy"][metric] = scores[metric]
    srsly.write_json(best_dest / "meta.json", meta)
    return best_dest
# Export data to a file
# The output file keeps the input's file name, with its suffix replaced by
# the requested file type, and is placed inside output_dir.
suffix = ".{}".format(file_type)
output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
# Pick the srsly writer that matches the requested file type.
if file_type == "json":
srsly.write_json(output_file, data)
elif file_type == "jsonl":
srsly.write_jsonl(output_file, data)
elif file_type == "msg":
srsly.write_msgpack(output_file, data)
msg.good(
"Generated output file ({} documents): {}".format(len(data), output_file)
)
# NOTE(review): the `if` this `else` pairs with lies outside this excerpt.
else:
# Print to stdout
# "-" is passed as the location instead of a file path; only the "json" and
# "jsonl" types are handled in the visible part of this branch.
if file_type == "json":
srsly.write_json("-", data)
elif file_type == "jsonl":
srsly.write_jsonl("-", data)
# Replace the config value with its string form (presumably so the config can
# be written as JSON below -- TODO confirm; the enclosing loop is not visible).
config[key] = str(config[key])
msg = Printer()
util.fix_random_seed(seed)
has_gpu = prefer_gpu()
if has_gpu:
# torch is only imported when a GPU is in use; tensors created afterwards
# default to CUDA float tensors.
import torch
torch.set_default_tensor_type("torch.cuda.FloatTensor")
msg.info("Using GPU" if has_gpu else "Not using GPU")
# Make sure the output directory exists and record the settings in it.
output_dir = Path(output_dir)
if not output_dir.exists():
# mkdir() is called without parents=True, so the parent directory must exist.
output_dir.mkdir()
msg.good("Created output directory")
srsly.write_json(output_dir / "config.json", config)
msg.good("Saved settings to config.json")
# Load texts from file or stdin
if texts_loc != "-": # reading from a file
texts_loc = Path(texts_loc)
if not texts_loc.exists():
msg.fail("Input text file doesn't exist", texts_loc, exits=1)
with msg.loading("Loading input texts..."):
# The whole file is read into a list so it can be checked for emptiness and
# shuffled below.
texts = list(srsly.read_jsonl(texts_loc))
if not texts:
msg.fail("Input file is empty", texts_loc, exits=1)
msg.good("Loaded input texts")
random.shuffle(texts)
else: # reading from stdin
msg.text("Reading input text from stdin...")
# Unlike the file branch, the result is neither wrapped in list() nor shuffled.
texts = srsly.read_jsonl("-")
# Record the word count and CPU/GPU words-per-second for this beam width.
meta["beam_speed"][beam_width] = {
"nwords": nwords,
"cpu": cpu_wps,
"gpu": gpu_wps,
}
# Summarize the vectors table attached to the vocab.
meta["vectors"] = {
"width": nlp.vocab.vectors_length,
"vectors": len(nlp.vocab.vectors),
"keys": nlp.vocab.vectors.n_keys,
"name": nlp.vocab.vectors.name,
}
# Name and version are only filled in when meta does not already define them.
meta.setdefault("name", "model%d" % i)
meta.setdefault("version", version)
meta["labels"] = nlp.meta["labels"]
# Write the meta into the "model<i>" directory under output_path.
meta_loc = output_path / ("model%d" % i) / "meta.json"
srsly.write_json(meta_loc, meta)
util.set_env_log(verbose)
# Gather this iteration's losses, scores and speed figures via _get_progress
# (presumably one row of the progress output -- TODO confirm).
progress = _get_progress(
i,
losses,
scorer.scores,
output_stats,
# The beam width is only passed along when has_beam_widths is truthy.
beam_width=beam_width if has_beam_widths else None,
cpu_wps=cpu_wps,
gpu_wps=gpu_wps,
)
# On the first iteration, if "textcat" is in the pipeline, warn about every
# category whose ROC AUC score is negative.
if i == 0 and "textcat" in pipeline:
textcats_per_cat = scorer.scores.get("textcats_per_cat", {})
for cat, cat_score in textcats_per_cat.items():
# A missing "roc_auc_score" defaults to 0 and so does not trigger the warning.
if cat_score.get("roc_auc_score", 0) < 0:
msg.warn(
no_print=True)
all_sents = file_as_json[0]['paragraphs'][0]['sentences']
random.seed(42)
random.shuffle(all_sents)
train_size = round(len(all_sents) * 0.7)
train_sents = all_sents[:train_size]
dev_sents = all_sents[train_size:]
train_json = [{'id': 0, 'paragraphs': [{'sentences': train_sents}]}]
dev_json = [{'id': 0, 'paragraphs': [{'sentences': dev_sents}]}]
srsly.write_json(train_json_path, train_json)
srsly.write_json(dev_json_path, dev_json)
assert os.path.isfile(train_json_path) and os.path.isfile(train_json_path)
return GoldCorpus(Path(train_json_path), Path(dev_json_path))
"cfg": lambda p: srsly.write_json(p, cfg),
}
import srsly
from spacy.cli.converters import conllu2json
from spacy.gold import GoldCorpus
from spacy.gold import Path
for part in ['train', 'dev', 'test']:
conll_path = os.path.join(self.dataset_dir, '{}.{}{}'.format(self.dataset_name, part, self.file_extension))
json_path = os.path.join(self.dataset_dir, "ddt.{}.json".format(part))
if not os.path.isfile(json_path): # Convert the conllu files to json
with open(conll_path, 'r') as file:
file_as_string = file.read()
file_as_string = file_as_string.replace("name=", "").replace("|SpaceAfter", "")
file_as_json = conllu2json(file_as_string)
srsly.write_json(json_path, file_as_json)
train_json_path = os.path.join(self.dataset_dir, "ddt.train.json")
dev_json_path = os.path.join(self.dataset_dir, "ddt.dev.json")
assert os.path.isfile(train_json_path)
assert os.path.isfile(dev_json_path)
return GoldCorpus(Path(train_json_path), Path(dev_json_path))
# Apply the current beam width to every pipeline component that has a cfg.
for name, component in nlp_loaded.pipeline:
if hasattr(component, "cfg"):
component.cfg["beam_width"] = beam_width
# The dev docs are materialized into a list before the timer starts.
dev_docs = list(
corpus.dev_docs(
nlp_loaded,
gold_preproc=gold_preproc,
ignore_misaligned=True,
)
)
start_time = timer()
scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
end_time = timer()
# Words per second for the evaluation pass that was just timed.
cpu_wps = nwords / (end_time - start_time)
# Save the scores into the "model<i>" directory under output_path.
acc_loc = output_path / ("model%d" % i) / "accuracy.json"
srsly.write_json(acc_loc, scorer.scores)
# Update model meta.json
meta["lang"] = nlp.lang
meta["pipeline"] = nlp.pipe_names
meta["spacy_version"] = ">=%s" % about.__version__
# Beam width 1 is stored as the regular speed/accuracy entries; any other
# width goes into the per-beam-width tables instead.
if beam_width == 1:
meta["speed"] = {
"nwords": nwords,
"cpu": cpu_wps,
"gpu": gpu_wps,
}
meta["accuracy"] = scorer.scores
else:
meta.setdefault("beam_accuracy", {})
meta.setdefault("beam_speed", {})
meta["beam_accuracy"][beam_width] = scorer.scores