# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# NOTE(review): This looks like the spaCy CLI `download` command, but the
# fragment is truncated — the `else:` branch is cut off after its first line
# (the following lines in the file belong to unrelated code). Indentation has
# also been flattened, so this block is not valid Python as it stands.
# Comments only; no code changed.
def download(model, direct=False, *pip_args):
"""
Download compatible model from default download path using pip. Model
can be shortcut, model name or, if --direct flag is set, full model name
with version. For direct downloads, the compatibility check will be skipped.
"""
# If spaCy itself is not installed, installing the model's dependencies
# would pull spaCy in via pip — force --no-deps so that doesn't happen.
if not require_package("spacy") and "--no-deps" not in pip_args:
msg.warn(
"Skipping model package dependencies and setting `--no-deps`. "
"You don't seem to have the spaCy package itself installed "
"(maybe because you've built from source?), so installing the "
"model dependencies would cause spaCy to be downloaded, which "
"probably isn't what you want. If the model package has other "
"dependencies, you'll have to install them manually."
)
pip_args = pip_args + ("--no-deps",)
# pip-installable archive path template: "<name>-<ver>/<name>-<ver>.tar.gz"
# pinned with an #egg fragment.
dl_tpl = "{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}"
if direct:
# --direct mode: `model` is "<name>-<version>". Package names use
# underscores (e.g. en_core_web_sm), so splitting on "-" leaves the
# version as the last component and "".join() reassembles the name.
components = model.split("-")
model_name = "".join(components[:-1])
version = components[-1]
dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
else:
# Fragment ends here — shortcut resolution is cut off after this line.
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
# NOTE(review): Orphaned fragment — `data`, `accepted`, `rejected`, and
# `set_id` are not defined anywhere in view. This reads like the tail of an
# annotation-evaluation function whose header was lost in extraction.
# Comments only; no code changed.
ignored = [eg for eg in data if eg["answer"] == "ignore"]
if not accepted and not rejected:
msg.warn("No annotations collected", exits=1)
# Tally how often the annotator's accepted choices agree with the model:
# one "option" per presented pair, one "accept" per agreement.
total_count = 0
agree_count = 0
for eg in accepted:
total_count += len(eg.get("options", []))
agree_count += len(eg.get("accept", []))
msg.info(f"Evaluating data from '{set_id}'")
msg.text(f"You rejected {len(rejected)} and ignored {len(ignored)} pair(s)")
# NOTE(review): raises ZeroDivisionError if every accepted example had an
# empty "options" list — presumably impossible upstream; confirm.
pc = agree_count / total_count
text = f"You agreed {agree_count} / {total_count} times ({pc:.0%})"
# Majority agreement with the model is reported as success.
if pc > 0.5:
msg.good(text)
else:
msg.fail(text)
# NOTE(review): Orphaned fragment — `match`, `model_hashes`, `setup_tf_env`,
# and `model_to_dot` are not defined in view, `return` appears outside any
# visible `def`, and the `try:` below has no visible `except`. The enclosing
# function was lost in extraction. Comments only; no code changed.
import importlib
import gym # noqa
setup_tf_env()
# Module path and attribute name come from a regex match produced elsewhere.
model_module_full: str = match.group(1)
model_type: str = match.group(2)
mathy_python = Path(__file__).parent.parent.parent / "mathy_python"
# Map the dotted module path to a .py file on disk.
model_file_name = os.path.join(
mathy_python, model_module_full.replace(".", os.path.sep) + ".py"
)
assert os.path.exists(model_file_name), f"model file not found: {model_file_name}"
# md5 of the source file is used purely as a cache key (not security).
# NOTE(review): the file handle from open() is never closed — worth fixing
# with a `with` block when the surrounding function is restored.
model_hash = hashlib.md5(open(model_file_name, "r").read().encode()).hexdigest()
if model_hash in model_hashes:
return model_hashes[model_hash]
try:
with msg.loading(f"Loading model: {model_module_full}"):
model_mod = importlib.import_module(model_module_full)
if hasattr(model_mod, model_type) is False:
return (
f"Failed to render architecture because module has no {model_type}"
)
# Instantiate the model factory and render its graph to DOT.
model_fn = getattr(model_mod, model_type)
model = model_fn()
dot = model_to_dot(
model,
show_shapes=True,
show_classes=True,
show_layer_names=True,
rankdir="TB",
dpi=64,
)
# NOTE(review): `worker` references `self` and `ParallelPracticeRunner`
# without taking `self` as a parameter — presumably a closure nested inside a
# method of that class; confirm against the original file. The body is
# truncated mid-`self.execute_episode(...)` call, and the trailing tuple/
# msg.table lines belong to a different, unrelated fragment. Comments only.
def worker(worker_idx: int, work_queue: Queue, result_queue: Queue):
"""Pull items out of the work queue and execute episodes until there are
no items left """
game = self.get_env()
predictor = self.get_model(game)
msg.good(f"Worker {worker_idx} started.")
# Loop until a quit is requested globally or the queue drains.
# NOTE(review): empty()-then-get() is racy across workers — a queue can
# drain between the check and the get; confirm intent upstream.
while (
ParallelPracticeRunner.request_quit is False
and work_queue.empty() is False
):
episode, args = work_queue.get()
start = time.time()
try:
(
episode_examples,
episode_reward,
is_win,
problem,
) = self.execute_episode(
episode,
game,
# --- fragment boundary: the lines below are from an unrelated results
# table (word-count stats), not part of execute_episode's arguments ---
("Docs", f"{n_docs:,}"),
("Words", f"{n_words:,}"),
("Words/s", f"{wps:,}"),
]
msg.table(result, widths=(7, 12), aligns=("l", "r"))
if __name__ == "__main__":
    # Command-line dispatch: first positional argument selects a sub-command,
    # the rest of argv is forwarded to it via plac.
    commands = {"train": train_model, "evaluate": evaluate_model, "wps": wps}
    selected = sys.argv.pop(1)
    if selected not in commands:
        available = ", ".join(commands)
        msg.fail(f"Unknown command: {selected}", f"Available: {available}", exits=1)
    try:
        plac.call(commands[selected])
    except KeyboardInterrupt:
        # Treat Ctrl-C as a clean, user-requested stop.
        msg.warn("Stopped.", exits=1)
# NOTE(review): spaCy CLI `package` command, truncated inside the final
# `msg.fail(` call (the argument list and everything after the lang/name/
# version validation loop are missing). Indentation is flattened, so the
# pairing of the `else:` with its `if` is ambiguous here. Comments only.
def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
"""
Generate Python package for model data, including meta and required
installation files. A new directory will be created in the specified
output directory, and model data will be copied over. If --create-meta is
set and a meta.json already exists in the output directory, the existing
values will be used as the defaults in the command-line prompt.
"""
input_path = util.ensure_path(input_dir)
output_path = util.ensure_path(output_dir)
meta_path = util.ensure_path(meta_path)
# Validate all three paths up front; msg.fail(..., exits=1) terminates.
if not input_path or not input_path.exists():
msg.fail("Can't locate model data", input_path, exits=1)
if not output_path or not output_path.exists():
msg.fail("Output directory not found", output_path, exits=1)
if meta_path and not meta_path.exists():
msg.fail("Can't find model meta.json", meta_path, exits=1)
# Fall back to a meta.json shipped inside the model data directory.
meta_path = meta_path or input_path / "meta.json"
if meta_path.is_file():
meta = srsly.read_json(meta_path)
if not create_meta: # only print if user doesn't want to overwrite
msg.good("Loaded meta.json from file", meta_path)
else:
# NOTE(review): passes `meta` into generate_meta — with flattened
# indentation it is unclear whether `meta` is guaranteed to be bound
# on this path; confirm against upstream.
meta = generate_meta(input_dir, meta, msg)
# These keys are mandatory to name the generated package.
for key in ("lang", "name", "version"):
if key not in meta or meta[key] == "":
msg.fail(
"No '{}' setting found in meta.json".format(key),
"This setting is required to build your package.",
exits=1,
# NOTE(review): sense2vec preprocessing step 1. The batching `else:` branch
# below increments batch_num and resets count but never writes the DocBin to
# disk and silently drops the current doc — the persistence logic appears to
# have been truncated from the original. Comments only; no code changed.
def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1, max_docs=10**6):
"""
Step 1: Parse raw text with spaCy
Expects an input file with one sentence per line and will output a .spacy
file of the parsed collection of Doc objects (DocBin).
"""
input_path = Path(in_file)
output_path = Path(out_dir)
if not input_path.exists():
msg.fail("Can't find input file", in_file, exits=1)
if not output_path.exists():
output_path.mkdir(parents=True)
msg.good(f"Created output directory {out_dir}")
nlp = spacy.load(spacy_model)
msg.info(f"Using spaCy model {spacy_model}")
# Only the attributes needed downstream are serialized into the DocBin.
doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
msg.text("Preprocessing text...")
count = 0
batch_num = 0
# One sentence per line; nlp.pipe streams them through spaCy, optionally
# across multiple processes.
with input_path.open("r", encoding="utf8") as texts:
docs = nlp.pipe(texts, n_process=n_process)
for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
if count < max_docs:
doc_bin.add(doc)
count += 1
else:
# NOTE(review): batch rollover — the filled doc_bin should be
# saved here before starting a new batch; see note above.
batch_num += 1
count = 0
msg.good(f"Processed {len(doc_bin)} docs")
# NOTE(review): Orphaned fragment — the `def` line and the opening `"""` of
# this docstring were lost (the lines below up to the closing `"""` are
# docstring text), and the body is truncated mid-loop after merge_phrases.
# Looks like sense2vec preprocessing step 2. Comments only; no code changed.
Expects a binary .spacy input file consisting of the parsed Docs (DocBin)
and outputs a text file with one sentence per line in the expected sense2vec
format (merged noun phrases, concatenated phrases with underscores and
added "senses").
Example input:
Rats, mould and broken furniture: the scandal of the UK's refugee housing
Example output:
Rats|NOUN ,|PUNCT mould|NOUN and|CCONJ broken_furniture|NOUN :|PUNCT
the|DET scandal|NOUN of|ADP the|DET UK|GPE 's|PART refugee_housing|NOUN
"""
input_path = Path(in_file)
output_path = Path(out_dir)
if not input_path.exists():
msg.fail("Can't find input file", in_file, exits=1)
if not output_path.exists():
output_path.mkdir(parents=True)
msg.good(f"Created output directory {out_dir}")
# The model is only loaded for its vocab, used to deserialize the DocBin.
nlp = spacy.load(spacy_model)
msg.info(f"Using spaCy model {spacy_model}")
with input_path.open("rb") as f:
doc_bin_bytes = f.read()
doc_bin = DocBin().from_bytes(doc_bin_bytes)
msg.good(f"Loaded {len(doc_bin)} parsed docs")
docs = doc_bin.get_docs(nlp.vocab)
# Output keeps the input's base name with the sense2vec .s2v extension.
output_file = output_path / f"{input_path.stem}.s2v"
lines_count = 0
words_count = 0
with output_file.open("w", encoding="utf8") as f:
for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
# merge_phrases is defined elsewhere; fragment ends inside this loop.
doc = merge_phrases(doc)
# NOTE(review): Orphaned fragment — these lines are the tail of a recipe
# function's parameter list (the `def` line and earlier parameters such as
# `strategy` and `vectors_path` are missing). Comments only; no code changed.
batch_size=10,
eval_whole=False,
eval_only=False,
show_scores=False,
):
"""
Evaluate a sense2vec model by asking about phrase triples: is word A more
similar to word B, or to word C? If the human mostly agrees with the model,
the vectors model is good.
"""
# Fixed seed so every evaluation session presents the same triples.
random.seed(0)
log("RECIPE: Starting recipe sense2vec.eval", locals())
strategies = eval_strategies.get_all()
if strategy not in strategies.keys():
err = f"Invalid strategy '{strategy}'. Expected: {list(strategies.keys())}"
msg.fail(err, exits=1)
s2v = Sense2Vec().from_disk(vectors_path)
log("RECIPE: Loaded sense2vec vectors", vectors_path)
def get_html(key, score=None, large=False):
    """Render a sense2vec key as an HTML snippet: word plus its sense tag.

    Closes over ``s2v`` and ``show_scores`` from the enclosing recipe.

    Args:
        key: A sense2vec key ("word|SENSE") split via ``s2v.split_key``.
        score: Optional similarity score, appended when ``show_scores`` is on.
        large: Use the larger (30px) font for the word instead of 20px.

    Returns:
        An HTML string for display in the annotation UI.
    """
    word, sense = s2v.split_key(key)
    # BUG FIX: the original used double quotes both for the f-string and for
    # the HTML attribute values, which terminated the literal early and was a
    # syntax error. Single-quoted f-strings keep the attribute quotes intact.
    html_word = f'<span style="font-size: {30 if large else 20}px">{word}</span>'
    html_sense = f'<strong style="opacity: 0.75; font-size: 14px; padding-left: 10px">{sense}</strong>'
    html = f"{html_word} {html_sense}"
    if show_scores and score is not None:
        html += f' <span style="opacity: 0.75; font-size: 12px; padding-left: 10px">{score:.4}</span>'
    return html
# NOTE(review): definition continues past the end of this view — incomplete;
# comments only, no code changed.
def get_stream():
# Resolve the selected evaluation strategy (closure over `strategy`).
strategy_func = eval_strategies.get(strategy)
log(f"RECIPE: Using strategy {strategy}")
# Limit to most frequent entries