Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
"""
with msg.loading("Loading compatibility table..."):
r = requests.get(about.__compatibility__)
if r.status_code != 200:
msg.fail(
"Server error ({})".format(r.status_code),
"Couldn't fetch compatibility table.",
exits=1,
)
msg.good("Loaded compatibility table")
compat = r.json()["spacy"]
version = about.__version__
version = version.rsplit(".dev", 1)[0]
current_compat = compat.get(version)
if not current_compat:
msg.fail(
"Can't find spaCy v{} in compatibility table".format(version),
about.__compatibility__,
exits=1,
)
all_models = set()
for spacy_v, models in dict(compat).items():
all_models.update(models.keys())
for model, model_vs in models.items():
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
model_links = get_model_links(current_compat)
model_pkgs = get_model_pkgs(current_compat, all_models)
incompat_links = {l for l, d in model_links.items() if not d["compat"]}
incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
incompat_models.update(
[d["name"] for _, d in model_links.items() if not d["compat"]]
)
msg.fail("Can't locate model data", input_path, exits=1)
if not output_path or not output_path.exists():
msg.fail("Output directory not found", output_path, exits=1)
if meta_path and not meta_path.exists():
msg.fail("Can't find model meta.json", meta_path, exits=1)
meta_path = meta_path or input_path / "meta.json"
if meta_path.is_file():
meta = srsly.read_json(meta_path)
if not create_meta: # only print if user doesn't want to overwrite
msg.good("Loaded meta.json from file", meta_path)
else:
meta = generate_meta(input_dir, meta, msg)
for key in ("lang", "name", "version"):
if key not in meta or meta[key] == "":
msg.fail(
"No '{}' setting found in meta.json".format(key),
"This setting is required to build your package.",
exits=1,
)
model_name = meta["lang"] + "_" + meta["name"]
model_name_v = model_name + "-" + meta["version"]
main_path = output_path / model_name_v
package_path = main_path / model_name
if package_path.exists():
if force:
shutil.rmtree(path2str(package_path))
else:
msg.fail(
"Package directory already exists",
"Please delete the directory and try again, or use the "
"""
Evaluate a trained model on Prodigy annotations and print the accuracy.
"""
with msg.loading(f"Loading model '{model}'..."):
nlp = spacy.load(model)
data, _ = format_data(srsly.read_jsonl(eval_path))
sc = nlp.evaluate(data)
result = [("F-Score", f"{sc.textcat_score:.3f}")]
msg.table(result)
if __name__ == "__main__":
opts = {"train": train_model, "evaluate": evaluate_model}
cmd = sys.argv.pop(1)
if cmd not in opts:
msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
try:
plac.call(opts[cmd])
except KeyboardInterrupt:
msg.warn("Stopped.", exits=1)
"The data should be located in {}".format(path2str(model_path)),
exits=1,
)
# --- `link` command: create a symlink for a model in spaCy's data dir ---
# Abort early if spaCy's data directory cannot be located at all.
data_path = util.get_data_path()
if not data_path or not data_path.exists():
    spacy_loc = Path(__file__).parent.parent
    msg.fail(
        "Can't find the spaCy data path to create model symlink",
        # Fix: the original message string had no {path} placeholder, so
        # the computed spaCy location was never shown to the user.
        "Make sure a directory `/data` exists within your spaCy "
        "installation and try again. The data directory should be located "
        "here: {path}".format(path=spacy_loc),
        exits=1,
    )
link_path = util.get_data_path() / link_name
if link_path.is_symlink() and not force:
    # An existing link is only replaced when --force is given.
    msg.fail(
        "Link '{}' already exists".format(link_name),
        "To overwrite an existing link, use the --force flag",
        exits=1,
    )
elif link_path.is_symlink():  # does a symlink exist?
    # NB: It's important to check for is_symlink here and not for exists,
    # because invalid/outdated symlinks would return False otherwise.
    link_path.unlink()
elif link_path.exists():  # does it exist otherwise?
    # NB: Check this last because valid symlinks also "exist".
    msg.fail(
        "Can't overwrite symlink '{}'".format(link_name),
        "This can happen if your data directory contains a directory or "
        "file of the same name.",
        exits=1,
    )
# --- Fragment: package directory creation and template writing (the
# enclosing function and several names used here, e.g. main_path / meta /
# output_path / force, are defined outside this view) ---
package_path = main_path
if package_path.exists():
if force:
shutil.rmtree(str(package_path))
else:
msg.fail(
title="Package directory already exists",
text="Please delete the directory and try again, or use the "
"`--force` flag to overwrite existing directories.",
exits=1,
)
Path.mkdir(package_path, parents=True, exist_ok=True)
# Copy each required model file into the package; abort if any is missing.
for f in REQUIRED_MODEL_FILES:
file_name: Path = input_path / f
if not file_name.exists():
msg.fail(
f"Input path '{input_path}' is missing a required file: '{f}'",
"This file is required to build your package.",
exits=1,
)
shutil.copyfile(file_name, main_path / f)
# NOTE(review): model files are copied into main_path while the config
# and setup.py go into output_path — confirm the intended layout.
create_file(output_path / "model.config.json", srsly.json_dumps(meta, indent=2))
create_file(output_path / "setup.py", TEMPLATE_SETUP)
create_file(package_path / "__init__.py", TEMPLATE_INIT)
msg.good("Successfully created package '{}'".format(package_path), main_path)
msg.text("To build the package, run `python setup.py sdist` in this directory.")
return str(package_path)
vectors.
"""
# --- Fragment: sense2vec neighbor precomputation (the docstring above is
# truncated; the enclosing `def` is outside this view) ---
# Select the array backend: NumPy on CPU, CuPy on the requested GPU.
if gpu_id == -1:
xp = numpy
else:
import cupy as xp
import cupy.cuda.device
# Patch in take_along_axis (presumably for CuPy versions that lack
# it — TODO confirm).
cupy.take_along_axis = take_along_axis
device = cupy.cuda.device.Device(gpu_id)
device.use()
vectors_dir = Path(vectors)
vectors_file = vectors_dir / "vectors"
if not vectors_dir.is_dir() or not vectors_file.exists():
err = "Are you passing in the exported sense2vec directory containing a vectors file?"
msg.fail(f"Can't load vectors from {vectors}", err, exits=1)
with msg.loading(f"Loading vectors from {vectors}"):
vectors = xp.load(str(vectors_file))
msg.good(f"Loaded {vectors.shape[0]:,} vectors with dimension {vectors.shape[1]}")
norms = xp.linalg.norm(vectors, axis=1, keepdims=True)
# Avoid division by zero for all-zero rows.
norms[norms == 0] = 1
# Normalize to unit norm
vectors /= norms
# A cutoff below 1 means "use all vectors".
if cutoff < 1:
cutoff = vectors.shape[0]
if end is None:
end = vectors.shape[0]
# NOTE: mean/variance are computed from the pre-normalization norms.
mean = float(norms.mean())
var = float(norms.var())
msg.good(f"Normalized (mean {mean:,.2f}, variance {var:,.2f})")
msg.info(f"Finding {n_neighbors:,} neighbors among {cutoff:,} most frequent")
n = min(n_neighbors, vectors.shape[0])
def get_json(url, desc):
    """Fetch *url* and return its parsed JSON body.

    Exits the process via msg.fail (exits=1) with a download hint when the
    server does not answer with HTTP 200; *desc* names the resource being
    fetched in that error message.
    """
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    # Anything but 200 is fatal: print guidance and terminate.
    msg.fail(
        "Server error ({})".format(response.status_code),
        "Couldn't fetch {}. Please find a model for your spaCy "
        "installation (v{}), and download it manually. For more "
        "details, see the documentation: "
        "https://spacy.io/usage/models".format(desc, about.__version__),
        exits=1,
    )
# --- Fragment: fastText model export (the enclosing function, the branch
# that assigns fasttext_model, and the `except` for the trailing `try:`
# are all outside this view) ---
fasttext_model.save_model(str(output_file))
if not output_file.exists() or not output_file.is_file():
msg.fail("Failed to save fastText model to disk", output_file, exits=1)
msg.good("Successfully saved fastText model to disk", output_file)
else:
# NOTE(review): this assignment is dead code — msg.fail(exits=1) on the
# next line terminates the process.
fasttext_model = None
msg.fail("Must provide an input directory or fastText binary filepath", exits=1)
msg.info("Creating vocabulary file")
vocab_file = output_path / "vocab.txt"
words, freqs = fasttext_model.get_words(include_freq=True)
# Write one "<word> <freq> word" line per vocabulary entry.
with vocab_file.open('w', encoding='utf8') as f:
for i in range(len(words)):
f.write(words[i] + " " + str(freqs[i]) + " word\n")
if not vocab_file.exists() or not vocab_file.is_file():
msg.fail("Failed to create vocabulary", vocab_file, exits=1)
msg.good("Successfully created vocabulary file", vocab_file)
msg.info("Creating vectors file")
vectors_file = output_path / "vectors.txt"
# Adapted from https://github.com/facebookresearch/fastText/blob/master/python/doc/examples/bin_to_vec.py#L31
with vectors_file.open('w', encoding='utf-8') as file_out:
# the first line must contain the number of total words and vector dimension
file_out.write(str(len(words)) + " " + str(fasttext_model.get_dimension()) + '\n')
# line by line, append vector to vectors file
for w in words:
v = fasttext_model.get_word_vector(w)
vstr = ""
for vi in v:
vstr += " " + str(vi)
try:
file_out.write(w + vstr + '\n')
# --- Fragment: `train` CLI command setup (the enclosing `def` is outside
# this view) ---
# Seed RNGs and configure logging verbosity before touching any data.
util.fix_random_seed()
util.set_env_log(verbose)
# Make sure all files and paths exists if they are needed
train_path = util.ensure_path(train_path)
dev_path = util.ensure_path(dev_path)
meta_path = util.ensure_path(meta_path)
output_path = util.ensure_path(output_path)
if raw_text is not None:
raw_text = list(srsly.read_jsonl(raw_text))
if not train_path or not train_path.exists():
msg.fail("Training data not found", train_path, exits=1)
if not dev_path or not dev_path.exists():
msg.fail("Development data not found", dev_path, exits=1)
if meta_path is not None and not meta_path.exists():
msg.fail("Can't find model meta.json", meta_path, exits=1)
# Missing meta is allowed here (defaults to an empty dict).
meta = srsly.read_json(meta_path) if meta_path else {}
# Warn — but do not abort — if the output dir already has subdirectories.
if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
msg.warn(
"Output directory is not empty",
"This can lead to unintended side effects when saving the model. "
"Please use an empty directory or a different path instead. If "
"the specified output path doesn't exist, the directory will be "
"created for you.",
)
if not output_path.exists():
output_path.mkdir()
# Take dropout and batch size as generators of values -- dropout
# starts high and decays sharply, to force the optimizer to explore.
# Batch size starts at 1 and grows, so that we make updates quickly
# at the beginning of training.