Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Fragment: set-up and main loop of a training script. The enclosing
# definition and the end of the loop body are outside this view.
# Set the optimizer's L2 attribute to zero before training starts.
optimizer.L2 = 0.0
# Iterator of learning rates; one value is drawn per batch further down.
# The arguments are learn_rate / 3, learn_rate * 3 and
# (2 * len(train_data)) // batch_size -- presumably minimum rate, maximum rate
# and period; confirm against cyclic_triangular_rate.
learn_rates = cyclic_triangular_rate(
learn_rate / 3, learn_rate * 3, 2 * len(train_data) // batch_size
)
pbar = tqdm.tqdm(total=100, leave=False)
# One (accuracy, step, epoch) tuple is appended per evaluation.
results = []
epoch = 0
step = 0
eval_every = 100
# NOTE(review): patience is not read anywhere in the visible lines.
patience = 3
while True:
# Train and evaluate
# Counter lets nlp.update accumulate per-component losses without pre-seeding keys.
losses = Counter()
# Shuffles train_data in place.
random.shuffle(train_data)
batches = minibatch(train_data, size=batch_size)
for batch in batches:
# Advance the learning-rate schedule by one value for this batch.
optimizer.trf_lr = next(learn_rates)
# Split the batch of (text, annotation) pairs into two parallel tuples.
texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, drop=0.1, losses=losses)
pbar.update(1)
# Evaluate whenever step is a non-zero multiple of eval_every.
# NOTE(review): step is not incremented in the visible lines -- confirm it is
# advanced in the part of the loop that is cut off.
if step and (step % eval_every) == 0:
pbar.close()
# Evaluation runs inside nlp.use_params(optimizer.averages).
with nlp.use_params(optimizer.averages):
scores = evaluate_multiclass(nlp, eval_texts, eval_cats)
results.append((scores["textcat_acc"], step, epoch))
# Tab-separated row: loss, accuracy, then the "textcat_cor" and "textcat_wrg" scores.
print(
"{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(
losses["trf_textcat"],
scores["textcat_acc"],
scores["textcat_cor"],
scores["textcat_wrg"],
def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_):
    """Convert JSONL NER annotations into a list of JSON-serialisable docs.

    Every non-blank line of ``input_data`` is parsed as one JSON record. A
    record must provide its raw text under ``"text"`` and its entity
    annotations under ``"entities"`` or, if that key is absent, ``"spans"``.
    Each annotation is a mapping with ``"start"``, ``"end"`` and ``"label"``
    keys, which are handed to ``doc.char_span`` as character offsets.

    input_data (str): JSONL content, one record per line. Blank lines are
        skipped, so empty input yields an empty result instead of a parse
        error.
    lang (str): Language identifier passed to ``get_lang_class``. Required.
    n_sents (int): Number of records grouped into each output entry.
    use_morphology (bool): Accepted for interface compatibility; not used.
    **_: Any further keyword arguments are accepted and ignored.
    RETURNS (list): One ``docs_to_json`` result per group of ``n_sents``
        records, in input order.
    RAISES (ValueError): If ``lang`` is None.
    """
    if lang is None:
        raise ValueError("No --lang specified, but tokenization required")
    # Skip blank lines: a trailing newline, an empty file or a gap between
    # records would otherwise be fed to the JSON parser as an empty string.
    input_examples = [
        srsly.json_loads(line) for line in input_data.split("\n") if line.strip()
    ]
    if not input_examples:
        # Nothing to convert; avoid building a pipeline for no records.
        return []
    nlp = get_lang_class(lang)()
    sentencizer = nlp.create_pipe("sentencizer")
    json_docs = []
    for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
        docs = []
        for record in batch:
            raw_text = record["text"]
            # "entities" takes precedence; "spans" is the fallback key.
            if "entities" in record:
                ents = record["entities"]
            else:
                ents = record["spans"]
            ents = [(e["start"], e["end"], e["label"]) for e in ents]
            doc = nlp.make_doc(raw_text)
            # The sentencizer is applied to the doc before entities are set.
            sentencizer(doc)
            spans = [
                doc.char_span(start, end, label=label) for start, end, label in ents
            ]
            # The raw spans go through _cleanup_spans before being assigned.
            doc.ents = _cleanup_spans(spans)
            docs.append(doc)
        json_docs.append(docs_to_json(docs, id=i))
    return json_docs
def merge_sentences(docs, n_sents):
    """Fold every run of ``n_sents`` docs into the first doc of that run.

    The docs are grouped with ``minibatch``. Within each group, the sentences
    of the first paragraph of every later doc are appended to the sentence
    list of the first paragraph of the leading doc. That leading doc is
    modified in place and is the only member of the group that is returned.

    docs (iterable): Dicts with a ``["paragraphs"][0]["sentences"]`` list.
    n_sents (int): Group size handed to ``minibatch``.
    RETURNS (list): The leading doc of each group, in order.
    """
    combined = []
    for chunk in minibatch(docs, size=n_sents):
        members = list(chunk)
        head = members[0]
        sentences = head["paragraphs"][0]["sentences"]
        # Append the remaining members' sentences onto the leading doc.
        for other in members[1:]:
            sentences.extend(other["paragraphs"][0]["sentences"])
        combined.append(head)
    return combined
# Fragment: entity-linking training set-up and epoch loop. The enclosing
# definition is outside this view.
with nlp.disable_pipes(*other_pipes): # only train Entity Linking
optimizer = nlp.begin_training()
# Override the fresh optimizer's learning rate and L2 value with lr and l2.
optimizer.learn_rate = lr
optimizer.L2 = l2
logger.info("Training on {} articles".format(len(train_data)))
logger.info("Dev testing on {} articles".format(len(dev_data)))
# baseline performance on dev data
logger.info("Dev Baseline Accuracies:")
measure_performance(dev_data, kb, el_pipe, baseline=True, context=False)
for itn in range(epochs):
# Shuffles train_data in place at the start of each epoch.
random.shuffle(train_data)
losses = {}
# The batch size comes from compounding(4.0, 128.0, 1.001) rather than a fixed number.
batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
# NOTE(review): batchnr is incremented below but not read in the visible lines.
batchnr = 0
with nlp.disable_pipes(*other_pipes):
for batch in batches:
try:
# Split the batch of (doc, gold) pairs into two parallel tuples.
docs, golds = zip(*batch)
nlp.update(
docs=docs,
golds=golds,
sgd=optimizer,
drop=dropout,
losses=losses,
)
batchnr += 1
# Any exception raised while handling a batch is logged and the loop moves on
# to the next batch, so a failing batch does not abort training.
except Exception as e:
logger.error("Error updating batch:" + str(e))
# Fragment: text-classifier training loop with early stopping. The enclosing
# definition is outside this view.
with nlp.disable_pipes(*other_pipes): # only train textcat
# Params aren't passed in properly in spaCy :(. Work around the bug.
optimizer = nlp.begin_training()
configure_optimizer(optimizer, opt_params)
# When init_tok2vec is given, its file is read as bytes and loaded into
# textcat.model.tok2vec.
if init_tok2vec is not None:
with Path(init_tok2vec).open('rb') as file_:
textcat.model.tok2vec.from_bytes(file_.read())
print("Training the model...")
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
for i in range(n_iter):
losses = {"textcat": 0.0}
if USE_TQDM:
# If we're using the CLI, a progress bar is nice.
# NOTE(review): this rebinds train_data to the tqdm wrapper; if it runs once
# per iteration the wrappers nest -- confirm against the original indentation.
train_data = tqdm.tqdm(train_data, leave=False)
# batch up the examples using spaCy's minibatch
batches = minibatch(train_data, size=batch_size)
for batch in batches:
# Split the batch of (text, annotation) pairs into two parallel tuples.
texts, annotations = zip(*batch)
nlp.update(
texts, annotations, sgd=optimizer, drop=dropout, losses=losses
)
# Evaluation runs inside textcat.model.use_params(optimizer.averages).
with textcat.model.use_params(optimizer.averages):
# evaluate on the dev data split off in load_data()
scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
# Keep the highest "acc" score seen so far; best_acc is initialised outside this view.
best_acc = max(best_acc, scores["acc"])
report_progress(i, best_acc, losses, scores)
# early_stopping.update receives the latest scores; a truthy result ends the loop.
should_stop = early_stopping.update(scores)
if should_stop:
break
n_texts, len(train_texts), len(dev_texts)
)
)
# Fragment: text-classifier training that loads tok2vec weights. The enclosing
# definition and the end of the final print call are outside this view.
# Pair each training text with a {"cats": ...} annotation dict.
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
with nlp.disable_pipes(*other_pipes): # only train textcat
optimizer = nlp.begin_training()
# Load tok2vec_weights (bytes) into the text classifier's tok2vec layer after
# begin_training() has been called.
textcat.model.tok2vec.from_bytes(tok2vec_weights)
print("Training the model...")
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
for i in range(n_iter):
losses = {"textcat": 0.0}
# batch up the examples using spaCy's minibatch
# train_data is wrapped in tqdm here, and the batch size is fixed at 2.
batches = minibatch(tqdm.tqdm(train_data), size=2)
for batch in batches:
# Split the batch of (text, annotation) pairs into two parallel tuples.
texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
# Evaluation runs inside textcat.model.use_params(optimizer.averages).
with textcat.model.use_params(optimizer.averages):
# evaluate on the dev data split off in load_data()
scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
# One row per iteration, in the same column order as the header printed above.
print(
"{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format( # print a simple table
losses["textcat"],
scores["textcat_p"],
scores["textcat_r"],
scores["textcat_f"],
)
# Evaluate the text classifier on (doc, gold) pairs. Only the first part of
# the function is visible here; what it returns is outside this view.
# NOTE(review): the task parameter is not read in the visible lines.
def evaluate(nlp, task, docs_golds):
tok2vec = nlp.get_pipe(PIPES.tok2vec)
textcat = nlp.get_pipe(PIPES.textcat)
# NOTE(review): right, total, guesses and truths are initialised here but not
# updated in the visible lines -- presumably filled in by the part cut off.
right = 0
total = 0
guesses = []
truths = []
labels = textcat.labels
for batch in minibatch(docs_golds, size=HP.eval_batch_size):
# Split the batch of (doc, gold) pairs into two parallel tuples.
docs, golds = zip(*batch)
# Pipe the docs through tok2vec and then textcat, materialising the result.
docs = list(textcat.pipe(tok2vec.pipe(docs)))
for doc, gold in zip(docs, golds):
# The guess and the truth are the highest-scoring category of doc.cats and
# gold.cats respectively.
guess, _ = max(doc.cats.items(), key=lambda it: it[1])
truth, _ = max(gold.cats.items(), key=lambda it: it[1])
# A guess outside textcat.labels is an error.
if guess not in labels:
msg = (
f"Unexpected label {guess} predicted. "
f"Expectded labels: {', '.join(labels)}"
)
raise ValueError(msg)
# NOTE(review): this branch is about the gold label, yet the message text says
# "predicted"; both messages also misspell "Expected". The strings are left
# untouched here because they are runtime output.
if truth not in labels:
msg = (
f"Unexpected label {truth} predicted. "
f"Expectded labels: {', '.join(labels)}"
)
# Fragment: body of a method that trains the "ner" pipe. The method header
# and the test-set handling that follows are outside this view.
# Names of all pipes except "ner", so they can be disabled during training.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes): # only train NER
# Setup lists to store the loss for each epoch
self.losses_train = []
self.losses_test = []
# reset and initialize the weights randomly -- but only if we're
# training a new model
if self.blank:
nlp.begin_training()
for epoch in range(self.epochs):
# Shuffles self.train in place at the start of each epoch.
random.shuffle(self.train)
losses = {}
# batch up the examples using spaCy's minibatch
batches = minibatch(self.train, size=self.batch_size)
for batch in batches:
# Split the batch of (text, annotation) pairs into two parallel tuples.
texts, annotations = zip(*batch)
# NOTE(review): no sgd argument is passed to nlp.update here, and the return
# value of begin_training() above is discarded -- confirm which optimizer is used.
nlp.update(
texts, # batch of texts
annotations, # batch of annotations
drop=self.drop, # dropout - make it harder to memorise data
losses=losses,
)
# Store loss for the epoch to a list
# Entries are ('Epoch N', loss) tuples with N counted from 1.
self.losses_train.append(('Epoch {}'.format(epoch+1), losses['ner']))
# Debug information is printed to the terminal and logs if the parameter debug = true
if self.debug:
self._print_log(8)
# If a test dataset is available, calculate losses for it as well
# Fragment: train the "parser" pipe on TRAIN_DATA, try it on one sentence and
# optionally save the pipeline. The enclosing definition is outside this view.
parser = nlp.get_pipe("parser")
# add labels to the parser
# Annotations without a "deps" key contribute no labels.
for _, annotations in TRAIN_DATA:
for dep in annotations.get("deps", []):
parser.add_label(dep)
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
with nlp.disable_pipes(*other_pipes): # only train parser
optimizer = nlp.begin_training()
for itn in range(n_iter):
# Shuffles TRAIN_DATA in place at the start of each iteration.
random.shuffle(TRAIN_DATA)
losses = {}
# batch up the examples using spaCy's minibatch
# The batch size comes from compounding(4.0, 32.0, 1.001) rather than a fixed number.
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
# Split the batch of (text, annotation) pairs into two parallel tuples.
texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
print("Losses", losses)
# test the trained model
test_text = "I like securities."
doc = nlp(test_text)
# Print (token text, dependency label, head text) for every token.
print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])
# save model to output directory
if output_dir is not None:
output_dir = Path(output_dir)
# NOTE(review): mkdir() is called without parents=True, so a missing parent
# directory raises FileNotFoundError.
if not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
# Fragment: start of the main training loop of a train command. The enclosing
# definition, the rest of the loop body and the handler for the try below are
# outside this view.
# Emit a separator row: one run of dashes per width in row_settings["widths"].
msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
try:
# NOTE(review): iter_since_best and best_score are initialised here but not
# read in the visible lines.
iter_since_best = 0
best_score = 0.0
for i in range(n_iter):
# corpus.train_docs is called afresh on every iteration with the settings below.
train_docs = corpus.train_docs(
nlp,
noise_level=noise_level,
orth_variant_level=orth_variant_level,
gold_preproc=gold_preproc,
max_length=0,
ignore_misaligned=True,
)
# When raw_text is given, it is shuffled in place and its "text" fields are
# turned into docs lazily, in batches of 8.
if raw_text:
random.shuffle(raw_text)
raw_batches = util.minibatch(
(nlp.make_doc(rt["text"]) for rt in raw_text), size=8
)
words_seen = 0
# The progress bar's total is n_train_words.
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
losses = {}
for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
# Skip empty batches; zip(*batch) below would fail to unpack on one.
if not batch:
continue
# Split the batch of (doc, gold) pairs into two parallel tuples.
docs, golds = zip(*batch)
# A dropout value is drawn from dropout_rates for every update.
nlp.update(
docs,
golds,
sgd=optimizer,
drop=next(dropout_rates),
losses=losses,
)