How to use the thinc.extra.datasets.imdb function in thinc

To help you get started, we've selected a few thinc.extra.datasets.imdb examples based on popular ways the function is used in public projects.
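Before looking at the project examples, here is a minimal sketch of the call itself, assuming thinc v7.x (where the thinc.extra.datasets module ships with the library). The loader returns a train split and a test split, each a sequence of (text, label) pairs; the examples below treat a truthy label as a positive review. The limit value is illustrative.

import thinc.extra.datasets

# Load the IMDB sentiment data (fetched on first use).
# Each split is a sequence of (text, label) pairs.
train_data, test_data = thinc.extra.datasets.imdb(limit=1000)

texts, labels = zip(*train_data)
print(f"{len(texts)} training texts, first label: {labels[0]}")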


github explosion / spacy-transformers / examples / test_wordpiece_alignment.py
this script should always pass.

    * retry: If alignment fails after cleaning and normalizing both sets of
        tokens, try again with a more aggressive strategy that strips out all
        characters that are not uppercase/lowercase letters.
    * force: If alignment still fails, run the word-piece tokenizer on the
        individual spaCy tokens, so that alignment is trivial. This should
        always work.
    """
    cfg = {"retry_alignment": retry, "force_alignment": force}
    nlp = get_lang_class(lang)()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    wp = TransformersWordPiecer.from_pretrained(nlp.vocab, trf_name=name, **cfg)
    msg.good(f"Loaded WordPiecer for model '{name}'")
    with msg.loading("Loading IMDB data..."):
        data, _ = thinc.extra.datasets.imdb(limit=n_texts)
    texts, _ = zip(*data)
    msg.good(f"Using {len(texts)} texts from IMDB data")
    msg.info("Processing texts...")
    sent_counts = 0
    for doc in tqdm.tqdm(nlp.pipe(texts), total=len(texts)):
        try:
            doc = wp(doc)
            sent_counts += len(list(doc.sents))
        except AssertionError as e:
            if len(e.args) and isinstance(e.args[0], tuple):  # Misaligned error
                a, b = e.args[0]
                msg.fail("Misaligned tokens")
                print(diff_strings(a, b))
                if not skip:
                    sys.exit(1)
            elif len(e.args):

github explosion / spacy-transformers / examples / train_textcat.py
def load_data_for_final_test(*, limit=0):
    print(
        "Warning: Using test data. You should use development data for most experiments."
    )
    train_data, test_data = thinc.extra.datasets.imdb()
    random.shuffle(train_data)
    train_data = train_data[-limit:]
    train_texts, train_labels = _prepare_partition(train_data)
    test_texts, test_labels = _prepare_partition(test_data)
    return (train_texts, train_labels), (test_texts, test_labels)

github explosion / spacy-transformers / examples / train_textcat.py
def load_data(*, limit=0, dev_size=2000):
    """Load data from the IMDB dataset, splitting off a held-out set."""
    if limit != 0:
        limit += dev_size
    assert dev_size != 0
    train_data, _ = thinc.extra.datasets.imdb(limit=limit)
    assert len(train_data) > dev_size
    random.shuffle(train_data)
    dev_data = train_data[:dev_size]
    train_data = train_data[dev_size:]
    train_texts, train_labels = _prepare_partition(train_data, preprocess=False)
    dev_texts, dev_labels = _prepare_partition(dev_data, preprocess=False)
    return (train_texts, train_labels), (dev_texts, dev_labels)
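
Both train_textcat.py snippets above rely on a _prepare_partition helper that isn't shown. The following is a hypothetical sketch of what such a helper might look like, based on the label-to-cats pattern used in the other examples on this page; the repository's actual implementation may differ.

def _prepare_partition(text_label_tuples, *, preprocess=False):
    # Split the (text, label) pairs and map each truthy/falsy label to the
    # cats dict format used by the textcat examples on this page.
    # `preprocess` is accepted only to match the call sites above; this
    # hypothetical version ignores it.
    texts, labels = zip(*text_label_tuples)
    cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
    return texts, cats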

github explosion / thinc / examples / ngram_bow.py
def main(use_gpu=False, nb_epoch=50):
    if use_gpu:
        Model.ops = CupyOps()
        Model.Ops = CupyOps
    train, test = datasets.imdb()
    print("Load data")
    train_X, train_y = zip(*train)
    test_X, test_y = zip(*test)
    train_y = to_categorical(train_y, nb_classes=2)
    test_y = to_categorical(test_y, nb_classes=2)

    nlp = Language()

    dev_X = train_X[-1000:]
    dev_y = train_y[-1000:]
    train_X = train_X[:-1000]
    train_y = train_y[:-1000]
    print("Parse data")
    train_X = [nlp.make_doc(x) for x in train_X]
    dev_X = [nlp.make_doc(x) for x in dev_X]

github explosion / thinc / examples / imdb_attention.py
def main(use_gpu=False, nb_epoch=100):
    fix_random_seed(0)
    if use_gpu:
        require_gpu()
    train, test = datasets.imdb(limit=2000)
    print("Load data")
    train_X, train_y = zip(*train)
    test_X, test_y = zip(*test)
    train_y = Model.ops.asarray(to_categorical(train_y, nb_classes=2))
    test_y = Model.ops.asarray(to_categorical(test_y, nb_classes=2))

    nlp = spacy.load("en_vectors_web_lg")
    nlp.add_pipe(nlp.create_pipe("sentencizer"), first=True)
    register_vectors(Model.ops, nlp.vocab.vectors.name, nlp.vocab.vectors.data)

    preprocessor = FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
    train_X = [preprocessor(list(doc.sents)) for doc in tqdm.tqdm(nlp.pipe(train_X))]
    test_X = [preprocessor(list(doc.sents)) for doc in tqdm.tqdm(nlp.pipe(test_X))]

    dev_X = train_X[-1000:]
    dev_y = train_y[-1000:]

github buriy / spacy-ru / examples / classifier.py
def load_data(limit=0, split=0.8):
    """Load data from the IMDB dataset."""
    # Partition off part of the train data for evaluation
    train_data, _ = thinc.extra.datasets.imdb()
    random.shuffle(train_data)
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    cats = [{'POSITIVE': bool(y)} for y in labels]
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

github explosion / spaCy / examples / pipeline / multi_processing.py
def main(output_dir, model="en_core_web_sm", n_jobs=4, batch_size=1000, limit=10000):
    nlp = spacy.load(model)  # load spaCy model
    print("Loaded model '%s'" % model)
    if not output_dir.exists():
        output_dir.mkdir()
    # load and pre-process the IMDB dataset
    print("Loading IMDB data...")
    data, _ = thinc.extra.datasets.imdb()
    texts, _ = zip(*data[-limit:])
    print("Processing texts...")
    partitions = minibatch(texts, size=batch_size)
    executor = Parallel(n_jobs=n_jobs, backend="multiprocessing", prefer="processes")
    do = delayed(partial(transform_texts, nlp))
    tasks = (do(i, batch, output_dir) for i, batch in enumerate(partitions))
    executor(tasks)

github explosion / spaCy / examples / training / pretrain_textcat.py
def load_texts(limit=0):
    train, dev = thinc.extra.datasets.imdb()
    train_texts, train_labels = zip(*train)
    dev_texts, dev_labels = zip(*dev)
    train_texts = list(train_texts)
    dev_texts = list(dev_texts)
    random.shuffle(train_texts)
    random.shuffle(dev_texts)
    if limit >= 1:
        return train_texts[:limit]
    else:
        return list(train_texts) + list(dev_texts)

github honnibal / spacy-pretrain-polyaxon / lmao-imdb-1k / pretrain_textcat.py
def load_textcat_data(is_final_result=False, limit=0):
    """Load data from the IMDB dataset."""
    train_data, eval_data = thinc.extra.datasets.imdb()
    random.shuffle(train_data)
    if not is_final_result:
        # Partition off part of the train data for evaluation
        eval_data = train_data[-5000:]
        train_data = train_data[:-5000]
    train_data = train_data[-limit:]
    train_texts, train_labels = zip(*train_data)
    eval_texts, eval_labels = zip(*eval_data)
    train_cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in train_labels]
    eval_cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in eval_labels]
    return (train_texts, train_cats), (eval_texts, eval_cats)
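
The (texts, cats) pairs produced by loaders like load_textcat_data are typically fed to spaCy v2's textcat component. The sketch below shows that downstream step under those assumptions (spaCy v2.x, the load_textcat_data function defined above, and illustrative hyperparameters); it is not taken from the repository itself.

import random
import spacy
from spacy.util import minibatch, compounding

nlp = spacy.blank("en")
textcat = nlp.create_pipe("textcat")
nlp.add_pipe(textcat, last=True)
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

(train_texts, train_cats), (eval_texts, eval_cats) = load_textcat_data(limit=2000)
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

optimizer = nlp.begin_training()
for epoch in range(5):
    random.shuffle(train_data)
    losses = {}
    for batch in minibatch(train_data, size=compounding(4.0, 32.0, 1.001)):
        texts, annotations = zip(*batch)
        # In spaCy v2, nlp.update accepts raw texts plus gold dicts with a "cats" key.
        nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
    print(epoch, losses)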