this script should always pass.
* retry: If alignment fails after cleaning and normalizing both sets of
  tokens, try again with a more aggressive strategy that strips out all
  characters that are not uppercase/lowercase letters (see the sketch after
  this snippet).
* force: If alignment still fails, run the word-piece tokenizer on the
  individual spaCy tokens, so that alignment is trivial. This should
  always work.
"""
cfg = {"retry_alignment": retry, "force_alignment": force}
nlp = get_lang_class(lang)()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
wp = TransformersWordPiecer.from_pretrained(nlp.vocab, trf_name=name, **cfg)
msg.good(f"Loaded WordPiecer for model '{name}'")
with msg.loading("Loading IMDB data..."):
data, _ = thinc.extra.datasets.imdb(limit=n_texts)
texts, _ = zip(*data)
msg.good(f"Using {len(texts)} texts from IMDB data")
msg.info("Processing texts...")
sent_counts = 0
for doc in tqdm.tqdm(nlp.pipe(texts), total=len(texts)):
try:
doc = wp(doc)
sent_counts += len(list(doc.sents))
except AssertionError as e:
if len(e.args) and isinstance(e.args[0], tuple): # Misaligned error
a, b = e.args[0]
msg.fail("Misaligned tokens")
print(diff_strings(a, b))
if not skip:
sys.exit(1)
elif len(e.args):
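
# The "retry" strategy described in the docstring above amounts to comparing the
# two token sequences after stripping everything that is not a letter. The sketch
# below is a hypothetical illustration of that cleanup, not the library's actual
# implementation; `clean_tokens` and `tokens_match` are illustrative names only.
import re

def clean_tokens(tokens):
    # Keep only uppercase/lowercase ASCII letters in each token.
    return [re.sub(r"[^A-Za-z]", "", token) for token in tokens]

def tokens_match(spacy_tokens, wordpiece_tokens):
    # Compare the concatenated, aggressively cleaned strings from both tokenizers.
    return "".join(clean_tokens(spacy_tokens)) == "".join(clean_tokens(wordpiece_tokens))
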
def load_data_for_final_test(*, limit=0):
    print(
        "Warning: Using test data. You should use development data for most experiments."
    )
    train_data, test_data = thinc.extra.datasets.imdb()
    random.shuffle(train_data)
    train_data = train_data[-limit:]
    train_texts, train_labels = _prepare_partition(train_data)
    test_texts, test_labels = _prepare_partition(test_data)
    return (train_texts, train_labels), (test_texts, test_labels)

def load_data(*, limit=0, dev_size=2000):
    """Load data from the IMDB dataset, splitting off a held-out set."""
    if limit != 0:
        limit += dev_size
    assert dev_size != 0
    train_data, _ = thinc.extra.datasets.imdb(limit=limit)
    assert len(train_data) > dev_size
    random.shuffle(train_data)
    dev_data = train_data[:dev_size]
    train_data = train_data[dev_size:]
    train_texts, train_labels = _prepare_partition(train_data, preprocess=False)
    dev_texts, dev_labels = _prepare_partition(dev_data, preprocess=False)
    return (train_texts, train_labels), (dev_texts, dev_labels)
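
# Both loaders above call a `_prepare_partition` helper that is not shown in this
# section. The sketch below is an assumption about its shape, modelled on the
# POSITIVE/NEGATIVE category format used by load_textcat_data further down; the
# `preprocess` flag is accepted but ignored here.
def _prepare_partition(text_label_tuples, *, preprocess=False):
    texts, labels = zip(*text_label_tuples)
    cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
    return texts, cats
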
def main(use_gpu=False, nb_epoch=50):
    if use_gpu:
        Model.ops = CupyOps()
        Model.Ops = CupyOps
    train, test = datasets.imdb()
    print("Load data")
    train_X, train_y = zip(*train)
    test_X, test_y = zip(*test)
    train_y = to_categorical(train_y, nb_classes=2)
    test_y = to_categorical(test_y, nb_classes=2)
    nlp = Language()
    dev_X = train_X[-1000:]
    dev_y = train_y[-1000:]
    train_X = train_X[:-1000]
    train_y = train_y[:-1000]
    print("Parse data")
    train_X = [nlp.make_doc(x) for x in train_X]
    dev_X = [nlp.make_doc(x) for x in dev_X]

def main(use_gpu=False, nb_epoch=100):
    fix_random_seed(0)
    if use_gpu:
        require_gpu()
    train, test = datasets.imdb(limit=2000)
    print("Load data")
    train_X, train_y = zip(*train)
    test_X, test_y = zip(*test)
    train_y = Model.ops.asarray(to_categorical(train_y, nb_classes=2))
    test_y = Model.ops.asarray(to_categorical(test_y, nb_classes=2))
    nlp = spacy.load("en_vectors_web_lg")
    nlp.add_pipe(nlp.create_pipe("sentencizer"), first=True)
    register_vectors(Model.ops, nlp.vocab.vectors.name, nlp.vocab.vectors.data)
    preprocessor = FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
    train_X = [preprocessor(list(doc.sents)) for doc in tqdm.tqdm(nlp.pipe(train_X))]
    test_X = [preprocessor(list(doc.sents)) for doc in tqdm.tqdm(nlp.pipe(test_X))]
    dev_X = train_X[-1000:]
    dev_y = train_y[-1000:]

def load_data(limit=0, split=0.8):
    """Load data from the IMDB dataset."""
    # Partition off part of the train data for evaluation
    train_data, _ = thinc.extra.datasets.imdb()
    random.shuffle(train_data)
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    cats = [{"POSITIVE": bool(y)} for y in labels]
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

def main(output_dir, model="en_core_web_sm", n_jobs=4, batch_size=1000, limit=10000):
nlp = spacy.load(model) # load spaCy model
print("Loaded model '%s'" % model)
if not output_dir.exists():
output_dir.mkdir()
# load and pre-process the IMBD dataset
print("Loading IMDB data...")
data, _ = thinc.extra.datasets.imdb()
texts, _ = zip(*data[-limit:])
print("Processing texts...")
partitions = minibatch(texts, size=batch_size)
executor = Parallel(n_jobs=n_jobs, backend="multiprocessing", prefer="processes")
do = delayed(partial(transform_texts, nlp))
tasks = (do(i, batch, output_dir) for i, batch in enumerate(partitions))
executor(tasks)
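
# `transform_texts` is the per-batch worker handed to joblib above; it is not
# defined in this section. The sketch below is a minimal assumption about such a
# worker, not the original implementation: it tokenizes a batch with nlp.pipe and
# writes one whitespace-joined line per text to `<output_dir>/<batch_id>.txt`.
from pathlib import Path

def transform_texts(nlp, batch_id, texts, output_dir):
    out_path = Path(output_dir) / f"{batch_id}.txt"
    if out_path.exists():  # skip batches that were already processed
        return None
    with out_path.open("w", encoding="utf8") as f:
        for doc in nlp.pipe(texts):
            f.write(" ".join(token.text for token in doc if not token.is_space))
            f.write("\n")
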
def load_texts(limit=0):
    train, dev = thinc.extra.datasets.imdb()
    train_texts, train_labels = zip(*train)
    dev_texts, dev_labels = zip(*dev)
    train_texts = list(train_texts)
    dev_texts = list(dev_texts)
    random.shuffle(train_texts)
    random.shuffle(dev_texts)
    if limit >= 1:
        return train_texts[:limit]
    else:
        return list(train_texts) + list(dev_texts)

def load_textcat_data(is_final_result=False, limit=0):
    """Load data from the IMDB dataset."""
    train_data, eval_data = thinc.extra.datasets.imdb()
    random.shuffle(train_data)
    if not is_final_result:
        # Partition off part of the train data for evaluation
        eval_data = train_data[-5000:]
        train_data = train_data[:-5000]
    train_data = train_data[-limit:]
    train_texts, train_labels = zip(*train_data)
    eval_texts, eval_labels = zip(*eval_data)
    train_cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in train_labels]
    eval_cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in eval_labels]
    return (train_texts, train_cats), (eval_texts, eval_cats)
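
# A short usage sketch for the (texts, cats) pairs returned above, assuming the
# spaCy v2 API used throughout this section (nlp.update with "cats" annotations)
# and a blank English pipeline; the epoch count, batch size and dropout are
# illustrative values only.
import random
import spacy
from spacy.util import minibatch

(train_texts, train_cats), _ = load_textcat_data(limit=2000)
nlp = spacy.blank("en")
textcat = nlp.create_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
nlp.add_pipe(textcat, last=True)

train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
optimizer = nlp.begin_training()
for epoch in range(5):
    random.shuffle(train_data)
    losses = {}
    for batch in minibatch(train_data, size=8):
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
    print(epoch, losses)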