msg.divider("Installed models (spaCy v{})".format(about.__version__))
msg.info("spaCy installation: {}".format(path2str(spacy_dir)))
if model_links or model_pkgs:
header = ("TYPE", "NAME", "MODEL", "VERSION", "")
rows = []
for name, data in model_pkgs.items():
rows.append(get_model_row(current_compat, name, data, msg))
for name, data in model_links.items():
rows.append(get_model_row(current_compat, name, data, msg, "link"))
msg.table(rows, header=header)
else:
msg.text("No models found in your current environment.", exits=0)
if update_models:
msg.divider("Install updates")
msg.text("Use the following commands to update the model packages:")
cmd = "python -m spacy download {}"
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
if na_models:
msg.text(
"The following models are not available for spaCy "
"v{}: {}".format(about.__version__, ", ".join(na_models))
)
if incompat_links:
msg.text(
"You may also want to overwrite the incompatible links using the "
"`python -m spacy link` command with `--force`, or remove them "
"from the data directory. "
"Data path: {path}".format(path=path2str(get_data_path()))
)
if incompat_models or incompat_links:
sys.exit(1)
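
# The table code above consumes per-model dicts with "name", "version" and
# "compat" keys; `get_model_row` formats one entry (plus the compatibility
# table) into a row tuple. An assumed illustration of those inputs:
model_pkgs_example = {
    "en_core_web_sm": {"name": "en_core_web_sm", "version": "2.2.5", "compat": True},
}
current_compat_example = {"en_core_web_sm": ["2.2.5"]}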

# Prodigy recipe helper: report agreement stats over the options the
# annotator accepted, rejected, or ignored in a dataset.
def eval_dataset(set_id):
    DB = connect()
    data = DB.get_dataset(set_id)
    accepted = [eg for eg in data if eg["answer"] == "accept" and eg.get("accept")]
    rejected = [eg for eg in data if eg["answer"] == "reject"]
    ignored = [eg for eg in data if eg["answer"] == "ignore"]
    if not accepted and not rejected:
        msg.warn("No annotations collected", exits=1)
    total_count = 0
    agree_count = 0
    for eg in accepted:
        total_count += len(eg.get("options", []))
        agree_count += len(eg.get("accept", []))
    msg.info(f"Evaluating data from '{set_id}'")
    msg.text(f"You rejected {len(rejected)} and ignored {len(ignored)} pair(s)")
    pc = agree_count / total_count
    text = f"You agreed {agree_count} / {total_count} times ({pc:.0%})"
    if pc > 0.5:
        msg.good(text)
    else:
        msg.fail(text)
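
# A helper like `eval_dataset` is typically run when the annotation session
# ends, via the "on_exit" callback in the components dict a Prodigy recipe
# returns. A hedged sketch (dataset name, stream, and view_id are placeholders):
components = {
    "dataset": "my_eval_dataset",  # hypothetical dataset name
    "stream": [],  # placeholder stream of annotation tasks
    "view_id": "choice",
    "on_exit": lambda ctrl: eval_dataset("my_eval_dataset"),
}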

# A/B variant of the helper: count which candidate ("A" or "B") the
# annotator preferred across the accepted examples.
def eval_dataset(set_id):
    DB = connect()
    data = DB.get_dataset(set_id)
    accepted = [eg for eg in data if eg["answer"] == "accept" and eg.get("accept")]
    rejected = [eg for eg in data if eg["answer"] == "reject"]
    ignored = [eg for eg in data if eg["answer"] == "ignore"]
    if not accepted and not rejected:
        msg.warn("No annotations collected", exits=1)
    counts = Counter()
    for eg in accepted:
        for model_id in eg["accept"]:
            counts[model_id] += 1
    preference, _ = counts.most_common(1)[0]
    ratio = f"{counts[preference]} / {sum(counts.values()) - counts[preference]}"
    msg.info(f"Evaluating data from '{set_id}'")
    msg.text(f"You rejected {len(rejected)} and ignored {len(ignored)} pair(s)")
    if counts["A"] == counts["B"]:
        msg.warn(f"No preference ({ratio})")
    else:
        pc = counts[preference] / sum(counts.values())
        msg.good(f"You preferred vectors {preference} with {ratio} ({pc:.0%})")
        msg.text(mapping[preference])  # `mapping` resolves the option ID to a name

# Scored variant of the helper: check whether the annotator's choice agrees
# with the model's scores, tracking disagreements on high-confidence items.
def eval_dataset(set_id):
    DB = connect()
    data = DB.get_dataset(set_id)
    accepted = [eg for eg in data if eg["answer"] == "accept" and eg.get("accept")]
    rejected = [eg for eg in data if eg["answer"] == "reject"]
    if not accepted and not rejected:
        msg.warn("No annotations collected", exits=1)
    high_conf = 0.8  # "high confidence" cutoff; value assumed for this sketch
    agree_count = 0
    disagree_high_conf = 0
    for eg in accepted:
        choice = eg["accept"][0]
        score_choice = [o["score"] for o in eg["options"] if o["id"] == choice][0]
        score_other = [o["score"] for o in eg["options"] if o["id"] != choice][0]
        if score_choice > score_other:
            agree_count += 1
        elif eg["confidence"] > high_conf:
            disagree_high_conf += 1
    pc = agree_count / (len(accepted) + len(rejected))
    text = f"You agreed {agree_count} / {len(data)} times ({pc:.0%})"
    msg.info(f"Evaluating data from '{set_id}'")
    if pc > 0.5:
        msg.good(text)
    else:
        msg.fail(text)
    msg.text(f"You disagreed on {disagree_high_conf} high confidence scores")
    msg.text(f"You rejected {len(rejected)} suggestions as not similar")

# spaCy `train` CLI: load the corpus, pick an optimizer, and validate the
# textcat configuration before training starts.
corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
n_train_words = corpus.count_train()

if base_model:
    # Start with an existing model, use default optimizer
    optimizer = create_default_optimizer(Model.ops)
else:
    # Start with a blank model, call begin_training
    optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
nlp._optimizer = None

# Load in pretrained weights
if init_tok2vec is not None:
    components = _load_pretrained_tok2vec(nlp, init_tok2vec)
    msg.text("Loaded pretrained tok2vec for: {}".format(components))

# Verify textcat config
if "textcat" in pipeline:
    textcat_labels = nlp.get_pipe("textcat").cfg["labels"]
    if textcat_positive_label and textcat_positive_label not in textcat_labels:
        msg.fail(
            "The textcat_positive_label (tpl) '{}' does not match any "
            "label in the training data.".format(textcat_positive_label),
            exits=1,
        )
    if textcat_positive_label and len(textcat_labels) != 2:
        msg.fail(
            "A textcat_positive_label (tpl) '{}' was provided for training "
            "data that does not appear to be a binary classification "
            "problem with two labels.".format(textcat_positive_label),
            exits=1,
        )
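
# A minimal sketch of a pipeline that satisfies both textcat checks above:
# exactly two labels, one of them named as the positive label (the label
# names here are assumptions for illustration).
textcat = nlp.create_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
nlp.add_pipe(textcat)
textcat_positive_label = "POSITIVE"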

# spaCy `package` CLI (tail of the packaging function): copy the required
# model files and templates into a distributable Python package directory.
Path.mkdir(package_path, parents=True, exist_ok=True)
for f in REQUIRED_MODEL_FILES:
    file_name: Path = input_path / f
    if not file_name.exists():
        msg.fail(
            f"Input path '{input_path}' is missing a required file: '{f}'",
            "This file is required to build your package.",
            exits=1,
        )
    shutil.copyfile(file_name, main_path / f)
create_file(output_path / "model.config.json", srsly.json_dumps(meta, indent=2))
create_file(output_path / "setup.py", TEMPLATE_SETUP)
create_file(package_path / "__init__.py", TEMPLATE_INIT)
msg.good("Successfully created package '{}'".format(package_path), main_path)
msg.text("To build the package, run `python setup.py sdist` in this directory.")
return str(package_path)
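
# After the package directory is created, building and installing it locally
# looks like this (paths are illustrative):
#
#   cd /path/to/package_dir
#   python setup.py sdist
#   pip install dist/<package-name>-<version>.tar.gz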

"""
Step 1: Parse raw text with spaCy

Expects an input file with one sentence per line and will output a .spacy
file of the parsed collection of Doc objects (DocBin).
"""
input_path = Path(in_file)
output_path = Path(out_dir)
if not input_path.exists():
    msg.fail("Can't find input file", in_file, exits=1)
if not output_path.exists():
    output_path.mkdir(parents=True)
    msg.good(f"Created output directory {out_dir}")
nlp = spacy.load(spacy_model)
msg.info(f"Using spaCy model {spacy_model}")
doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
msg.text("Preprocessing text...")
count = 0
batch_num = 0
with input_path.open("r", encoding="utf8") as texts:
    docs = nlp.pipe(texts, n_process=n_process)
    for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
        if count < max_docs:
            doc_bin.add(doc)
            count += 1
        else:
            # Current batch is full: write it out and start a fresh DocBin
            batch_num += 1
            count = 0
            msg.good(f"Processed {len(doc_bin)} docs")
            doc_bin_bytes = doc_bin.to_bytes()
            output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
            with output_file.open("wb") as f:
                f.write(doc_bin_bytes)
            doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
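
# Loading a saved batch back into Doc objects (a usage sketch; the file name
# is an assumed example of the naming pattern above):
from pathlib import Path
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
data = Path("out/texts-1.spacy").read_bytes()
doc_bin = DocBin().from_bytes(data)
docs = list(doc_bin.get_docs(nlp.vocab))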