    vec = item[1:]
    if len(vec) != vector_size:
        msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})", exits=1)
    all_senses.add(sense)
    vectors[key] = numpy.asarray(vec, dtype=numpy.float32)
# Drop rare and near-duplicate keys before building the model
discarded = set()
discarded.update(get_minority_keys(vocab, min_freq_ratio))
discarded.update(get_redundant_keys(vocab, vectors, min_distance))
n_vectors = len(vectors) - len(discarded)
s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=all_senses)
for key, vector in vectors.items():
    if key not in discarded:
        s2v.add(key, vector)
        s2v.set_freq(key, vocab[key])
msg.good("Created the sense2vec model")
msg.info(f"{n_vectors} vectors, {len(all_senses)} total senses")
s2v.to_disk(output_path)
msg.good("Saved model to directory", out_dir)
import sys

import plac
from wasabi import msg
from spacy.cli import download, link, info, train, pretrain, debug_data
from spacy.cli import evaluate, convert, package, init_model, profile, validate

commands = {
    "download": download,
    "link": link,
    "info": info,
    "train": train,
    "pretrain": pretrain,
    "debug-data": debug_data,
    "evaluate": evaluate,
    "convert": convert,
    "package": package,
    "init-model": init_model,
    "profile": profile,
    "validate": validate,
}
if len(sys.argv) == 1:
    msg.info("Available commands", ", ".join(commands), exits=1)
command = sys.argv.pop(1)
sys.argv[0] = "spacy %s" % command
if command in commands:
    plac.call(commands[command], sys.argv[1:])
else:
    available = "Available: {}".format(", ".join(commands))
    msg.fail("Unknown command: {}".format(command), available, exits=1)
def eval_dataset(set_id):
    DB = connect()
    data = DB.get_dataset(set_id)
    accepted = [eg for eg in data if eg["answer"] == "accept" and eg.get("accept")]
    rejected = [eg for eg in data if eg["answer"] == "reject"]
    ignored = [eg for eg in data if eg["answer"] == "ignore"]
    if not accepted and not rejected:
        msg.warn("No annotations collected", exits=1)
    total_count = 0
    agree_count = 0
    for eg in accepted:
        total_count += len(eg.get("options", []))
        agree_count += len(eg.get("accept", []))
    msg.info(f"Evaluating data from '{set_id}'")
    msg.text(f"You rejected {len(rejected)} and ignored {len(ignored)} pair(s)")
    pc = agree_count / total_count
    text = f"You agreed {agree_count} / {total_count} times ({pc:.0%})"
    if pc > 0.5:
        msg.good(text)
    else:
        msg.fail(text)
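# For reference, a sketch of the annotation record shape the eval above assumes
# (Prodigy-style answers; the field values here are illustrative only):
example_record = {
    "answer": "accept",                      # one of "accept", "reject", "ignore"
    "options": [{"id": "A"}, {"id": "B"}],   # the options that were displayed
    "accept": ["A"],                         # ids of the options the annotator selected
}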
def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1, max_docs=10**6):
    """
    Step 1: Parse raw text with spaCy
    Expects an input file with one sentence per line and will output a .spacy
    file of the parsed collection of Doc objects (DocBin).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
    msg.text("Preprocessing text...")
    count = 0
    batch_num = 0
    with input_path.open("r", encoding="utf8") as texts:
        docs = nlp.pipe(texts, n_process=n_process)
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            if count < max_docs:
                doc_bin.add(doc)
                count += 1
            else:
                batch_num += 1
                count = 0
                msg.good(f"Processed {len(doc_bin)} docs")
                doc_bin_bytes = doc_bin.to_bytes()
                output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
                # The snippet is cut off at this point; presumably the full batch is
                # written out and a fresh DocBin is started, e.g.:
                with output_file.open("wb") as f:
                    f.write(doc_bin_bytes)
                doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
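# A minimal sketch of reading one of the saved batches back, assuming the same
# pipeline that produced it (the file name below is illustrative):
import spacy
from spacy.tokens import DocBin

nlp = spacy.load("en_core_web_sm")
with open("output/input-1.spacy", "rb") as f:  # hypothetical output file
    saved = DocBin().from_bytes(f.read())
docs = list(saved.get_docs(nlp.vocab))
print(f"Restored {len(docs)} docs")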
if not accepted and not rejected:
    msg.warn("No annotations collected", exits=1)
high_conf = 0.8
agree_count = 0
disagree_high_conf = len([e for e in rejected if e["confidence"] > high_conf])
for eg in accepted:
    choice = eg["accept"][0]
    score_choice = [o["score"] for o in eg["options"] if o["id"] == choice][0]
    score_other = [o["score"] for o in eg["options"] if o["id"] != choice][0]
    if score_choice > score_other:
        agree_count += 1
    elif eg["confidence"] > high_conf:
        disagree_high_conf += 1
pc = agree_count / (len(accepted) + len(rejected))
text = f"You agreed {agree_count} / {len(accepted) + len(rejected)} times ({pc:.0%})"
msg.info(f"Evaluating data from '{set_id}'")
if pc > 0.5:
    msg.good(text)
else:
    msg.fail(text)
msg.text(f"You disagreed on {disagree_high_conf} high confidence scores")
msg.text(f"You rejected {len(rejected)} suggestions as not similar")
def eval_dataset(set_id):
    DB = connect()
    data = DB.get_dataset(set_id)
    accepted = [eg for eg in data if eg["answer"] == "accept" and eg.get("accept")]
    rejected = [eg for eg in data if eg["answer"] == "reject"]
    ignored = [eg for eg in data if eg["answer"] == "ignore"]
    if not accepted and not rejected:
        msg.warn("No annotations collected", exits=1)
    counts = Counter()
    for eg in accepted:
        for model_id in eg["accept"]:
            counts[model_id] += 1
    preference, _ = counts.most_common(1)[0]
    ratio = f"{counts[preference]} / {sum(counts.values()) - counts[preference]}"
    msg.info(f"Evaluating data from '{set_id}'")
    msg.text(f"You rejected {len(rejected)} and ignored {len(ignored)} pair(s)")
    if counts["A"] == counts["B"]:
        msg.warn(f"No preference ({ratio})")
    else:
        pc = counts[preference] / sum(counts.values())
        msg.good(f"You preferred vectors {preference} with {ratio} ({pc:.0%})")
        msg.text(mapping[preference])
msg.fail("Can't find GloVe build directory", glove_dir, exits=1)
if not input_path.exists() or not input_path.is_dir():
msg.fail("Not a valid input directory", in_dir, exits=1)
input_files = [str(fp) for fp in input_path.iterdir() if fp.suffix == ".s2v"]
if not input_files:
msg.fail("No .s2v files found in input directory", in_dir, exits=1)
msg.info(f"Using {len(input_files)} input files")
if not output_path.exists():
output_path.mkdir(parents=True)
msg.good(f"Created output directory {out_dir}")
vocab_file = output_path / f"vocab.txt"
cooc_file = output_path / f"cooccurrence.bin"
cooc_shuffle_file = output_path / f"cooccurrence.shuf.bin"
msg.info("Creating vocabulary counts")
cmd = (
f"cat {' '.join(input_files)} | {glove_dir}/vocab_count "
f"-min-count {min_count} -verbose {verbose} > {vocab_file}"
)
print(cmd)
vocab_cmd = os.system(cmd)
if vocab_cmd != 0 or not Path(vocab_file).exists():
msg.fail("Failed creating vocab counts", exits=1)
msg.good("Created vocab counts", vocab_file)
msg.info("Creating cooccurrence statistics")
cmd = (
f"cat {' '.join(input_files)} | {glove_dir}/cooccur -memory {memory} "
f"-vocab-file {vocab_file} -verbose {verbose} "
f"-window-size {window_size} > {cooc_file}"
)
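# The fragment above defines cooc_shuffle_file but is cut off before using it. A
# hedged sketch of how the remaining steps could continue, following the same
# os.system pattern with GloVe's shuffle binary (flag values are illustrative):
cooc_cmd = os.system(cmd)
if cooc_cmd != 0 or not Path(cooc_file).exists():
    msg.fail("Failed creating cooccurrence statistics", exits=1)
msg.good("Created cooccurrence statistics", cooc_file)
msg.info("Shuffling cooccurrence file")
cmd = (
    f"{glove_dir}/shuffle -memory {memory} -verbose {verbose} "
    f"< {cooc_file} > {cooc_shuffle_file}"
)
print(cmd)
if os.system(cmd) != 0 or not Path(cooc_shuffle_file).exists():
    msg.fail("Failed shuffling cooccurrence file", exits=1)
msg.good("Created shuffled cooccurrence file", cooc_shuffle_file)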
def profile(model, inputs=None, n_texts=10000):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.
    Input should be formatted as one JSON object per line with a key "text".
    It can either be provided as a JSONL file, or be read from sys.stdin.
    If no input file is specified, the IMDB dataset is loaded via Thinc.
    """
    if inputs is not None:
        inputs = _read_inputs(inputs, msg)
    if inputs is None:
        n_inputs = 25000
        with msg.loading("Loading IMDB dataset via Thinc..."):
            imdb_train, _ = thinc.extra.datasets.imdb()
            inputs, _ = zip(*imdb_train)
        msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
        inputs = inputs[:n_inputs]
    with msg.loading("Loading model '{}'...".format(model)):
        nlp = load_model(model)
    msg.good("Loaded model '{}'".format(model))
    texts = list(itertools.islice(inputs, n_texts))
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
    s = pstats.Stats("Profile.prof")
    msg.divider("Profile stats")
    s.strip_dirs().sort_stats("time").print_stats()
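# A minimal sketch of preparing the JSONL input the docstring above describes (one
# JSON object per line with a "text" key); file name and texts are illustrative:
import json
from pathlib import Path

examples = ["This movie was surprisingly good.", "I would not recommend it."]
with Path("profile_inputs.jsonl").open("w", encoding="utf8") as f:
    for text in examples:
        f.write(json.dumps({"text": text}) + "\n")
# The profiler could then be pointed at this file, e.g.
# profile("en_core_web_sm", inputs="profile_inputs.jsonl", n_texts=len(examples)).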
msg.good("Successfully saved fastText model to disk", output_file)
else:
fasttext_model = None
msg.fail("Must provide an input directory or fastText binary filepath", exits=1)
msg.info("Creating vocabulary file")
vocab_file = output_path / "vocab.txt"
words, freqs = fasttext_model.get_words(include_freq=True)
with vocab_file.open('w', encoding='utf8') as f:
for i in range(len(words)):
f.write(words[i] + " " + str(freqs[i]) + " word\n")
if not vocab_file.exists() or not vocab_file.is_file():
msg.fail("Failed to create vocabulary", vocab_file, exits=1)
msg.good("Successfully created vocabulary file", vocab_file)
msg.info("Creating vectors file")
vectors_file = output_path / "vectors.txt"
# Adapted from https://github.com/facebookresearch/fastText/blob/master/python/doc/examples/bin_to_vec.py#L31
with vectors_file.open('w', encoding='utf-8') as file_out:
# the first line must contain the number of total words and vector dimension
file_out.write(str(len(words)) + " " + str(fasttext_model.get_dimension()) + '\n')
# line by line, append vector to vectors file
for w in words:
v = fasttext_model.get_word_vector(w)
vstr = ""
for vi in v:
vstr += " " + str(vi)
try:
file_out.write(w + vstr + '\n')
except IOError as e:
if e.errno == EPIPE:
pass
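# A minimal sketch of reading the vectors.txt written above back into numpy arrays;
# the file uses the plain word2vec text format (header line with counts, then one
# "<word> <v1> <v2> ..." line per word):
import numpy

loaded = {}
with vectors_file.open("r", encoding="utf-8") as f:
    n_words, dim = map(int, f.readline().split())
    for line in f:
        parts = line.rstrip().split(" ")
        loaded[parts[0]] = numpy.asarray(parts[1:], dtype=numpy.float32)
print(f"Loaded {len(loaded)} of {n_words} vectors with dimension {dim}")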