def read_attrs_from_deprecated(freqs_loc, clusters_loc):
    if freqs_loc is not None:
        with msg.loading("Counting frequencies..."):
            probs, _ = read_freqs(freqs_loc)
        msg.good("Counted frequencies")
    else:
        probs, _ = ({}, DEFAULT_OOV_PROB)  # noqa: F841
    if clusters_loc:
        with msg.loading("Reading clusters..."):
            clusters = read_clusters(clusters_loc)
        msg.good("Read clusters")
    else:
        clusters = {}
    lex_attrs = []
    sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
    if len(sorted_probs):
        for i, (word, prob) in tqdm(enumerate(sorted_probs)):
            attrs = {"orth": word, "id": i, "prob": prob}
            # Decode as a little-endian string, so that we can do & 15 to get
            # the first 4 bits. See _parse_features.pyx
            if word in clusters:
                attrs["cluster"] = int(clusters[word][::-1], 2)
            else:
                attrs["cluster"] = 0
            lex_attrs.append(attrs)
    return lex_attrs
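
# A hedged usage sketch for the helper above: apply the collected attributes
# back to a vocab. `nlp`, `freqs_loc` and `clusters_loc` are assumed to exist
# in the caller; the loop body is an illustration, not the library's own code.
lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
for attrs in lex_attrs:
    lexeme = nlp.vocab[attrs["orth"]]
    lexeme.prob = attrs["prob"]
    lexeme.cluster = attrs["cluster"]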
try:
    progress = tqdm.tqdm(total=config["training"]["eval_frequency"], leave=False)
    for batch, info, is_best_checkpoint in training_step_iterator:
        progress.update(1)
        if is_best_checkpoint is not None:
            progress.close()
            print_row(info)
            if is_best_checkpoint and output_path is not None:
                nlp.to_disk(output_path)
            progress = tqdm.tqdm(total=config["training"]["eval_frequency"], leave=False)
finally:
    if output_path is not None:
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
        msg.good("Saved model to output directory", final_model_path)
        # with msg.loading("Creating best model..."):
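
# The loop above re-creates the tqdm bar after each checkpoint so the bar
# tracks progress toward the next evaluation. A minimal, self-contained
# sketch of that reset pattern (the eval_frequency value is illustrative):
import tqdm

eval_frequency = 200  # assumed value for illustration
progress = tqdm.tqdm(total=eval_frequency, leave=False)
for step in range(1, 1001):
    progress.update(1)
    if step % eval_frequency == 0:  # stand-in for "is_best_checkpoint is not None"
        progress.close()
        progress = tqdm.tqdm(total=eval_frequency, leave=False)
progress.close()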
details = "%s --> %s" % (path2str(model_path), path2str(link_path))
try:
    symlink_to(link_path, model_path)
except:  # noqa: E722
    # This is quite dirty, but just making sure other errors are caught.
    msg.fail(
        "Couldn't link model to '{}'".format(link_name),
        "Creating a symlink in spacy/data failed. Make sure you have the "
        "required permissions and try re-running the command as admin, or "
        "use a virtualenv. You can still import the model as a module and "
        "call its load() method, or create the symlink manually.",
    )
    msg.text(details)
    raise
msg.good("Linking successful", details)
msg.text("You can now load the model via spacy.load('{}')".format(link_name))
# Returns
(str): The subfolder of the output path that contains the pypi package source.
"""
input_path = Path(input_dir)
output_path = Path(output_dir)
if meta_path is not None:
    meta_path = Path(meta_path)
if not input_path or not input_path.exists():
    msg.fail("Can't locate model data", input_path, exits=1)
if meta_path and not meta_path.exists():
    msg.fail("Can't find model.config.json", meta_path, exits=1)
meta_path = meta_path or input_path / "model.config.json"
if meta_path.is_file():
    meta = srsly.read_json(meta_path)
    msg.good("Loaded model.config.json from file", meta_path)
meta["mathy_version"] = f">={about.__version__},<1.0.0"
meta["name"] = model_name
for key in REQUIRED_META_KEYS:
    if key not in meta or meta[key] == "":
        msg.fail(
            "No '{}' setting found in model.config.json".format(key),
            "This setting is required to build your package.",
            exits=1,
        )
main_path = output_path / model_name
package_path = main_path
if package_path.exists():
    if force:
        shutil.rmtree(str(package_path))
    else:
        msg.fail(
            "Package directory already exists",
            "Please delete the directory and try again, or use the "
            "`--force` flag to overwrite existing directories.",
            exits=1,
        )
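
# REQUIRED_META_KEYS is not defined in this snippet; a plausible definition,
# based on the keys spaCy's package command validates (an assumption here):
REQUIRED_META_KEYS = ("lang", "name", "version")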
print(cmd)
cooccur_cmd = os.system(cmd)
if cooccur_cmd != 0 or not Path(cooc_file).exists():
    msg.fail("Failed creating cooccurrence statistics", exits=1)
msg.good("Created cooccurrence statistics", cooc_file)
msg.info("Shuffling cooccurrence file")
cmd = (
    f"{glove_dir}/shuffle -memory {memory} -verbose {verbose} "
    f"< {cooc_file} > {cooc_shuffle_file}"
)
print(cmd)
shuffle_cmd = os.system(cmd)
if shuffle_cmd != 0 or not Path(cooc_shuffle_file).exists():
    msg.fail("Failed to shuffle cooccurrence file", exits=1)
msg.good("Shuffled cooccurrence file", cooc_shuffle_file)
accepted = [eg for eg in data if eg["answer"] == "accept" and eg.get("accept")]
rejected = [eg for eg in data if eg["answer"] == "reject"]
ignored = [eg for eg in data if eg["answer"] == "ignore"]
if not accepted and not rejected:
    msg.warn("No annotations collected", exits=1)
total_count = 0
agree_count = 0
for eg in accepted:
    total_count += len(eg.get("options", []))
    agree_count += len(eg.get("accept", []))
msg.info(f"Evaluating data from '{set_id}'")
msg.text(f"You rejected {len(rejected)} and ignored {len(ignored)} pair(s)")
pc = agree_count / total_count
text = f"You agreed {agree_count} / {total_count} times ({pc:.0%})"
if pc > 0.5:
    msg.good(text)
else:
    msg.fail(text)
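
# Note: if every collected answer is a rejection, total_count above stays 0
# and the percentage would divide by zero. A defensive variant (an addition
# here, not part of the original recipe):
pc = agree_count / total_count if total_count else 0.0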
def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
    vectors_loc = ensure_path(vectors_loc)
    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
        nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
        for lex in nlp.vocab:
            if lex.rank:
                nlp.vocab.vectors.add(lex.orth, row=lex.rank)
    else:
        if vectors_loc:
            with msg.loading("Reading vectors from {}".format(vectors_loc)):
                vectors_data, vector_keys = read_vectors(vectors_loc)
            msg.good("Loaded vectors from {}".format(vectors_loc))
        else:
            vectors_data, vector_keys = (None, None)
        if vector_keys is not None:
            for word in vector_keys:
                if word not in nlp.vocab:
                    lexeme = nlp.vocab[word]
                    lexeme.is_oov = False
        if vectors_data is not None:
            nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
    if name is None:
        nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
    else:
        nlp.vocab.vectors.name = name
    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
    if prune_vectors >= 1:
        nlp.vocab.prune_vectors(prune_vectors)
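
# Hedged usage sketch for add_vectors (the path, pruning size and vectors
# name are illustrative; prune_vectors=-1 skips pruning per the check above):
import spacy
nlp = spacy.blank("en")
add_vectors(nlp, "vectors.txt", prune_vectors=-1, name="en_demo.vectors")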
high_conf = 0.8
agree_count = 0
disagree_high_conf = len([e for e in rejected if e["confidence"] > high_conf])
for eg in accepted:
    choice = eg["accept"][0]
    score_choice = [o["score"] for o in eg["options"] if o["id"] == choice][0]
    score_other = [o["score"] for o in eg["options"] if o["id"] != choice][0]
    if score_choice > score_other:
        agree_count += 1
    elif eg["confidence"] > high_conf:
        disagree_high_conf += 1
pc = agree_count / (len(accepted) + len(rejected))
text = f"You agreed {agree_count} / {len(data)} times ({pc:.0%})"
msg.info(f"Evaluating data from '{set_id}'")
if pc > 0.5:
    msg.good(text)
else:
    msg.fail(text)
msg.text(f"You disagreed on {disagree_high_conf} high confidence scores")
msg.text(f"You rejected {len(rejected)} suggestions as not similar")
msg.info("Saving output")
if not isinstance(best_rows, numpy.ndarray):
    best_rows = best_rows.get()
if not isinstance(scores, numpy.ndarray):
    scores = scores.get()
output = {
    "indices": best_rows,
    "scores": scores.astype("float16"),
    "start": start,
    "end": end,
    "cutoff": cutoff,
}
output_file = vectors_dir / "cache"
with msg.loading("Saving output..."):
    srsly.write_msgpack(output_file, output)
msg.good(f"Saved cache to {output_file}")