How to use the `squad.squad_data.SquadCorpus` function from the squad package

To help you get started, we've selected a few SQuAD examples, based on popular ways the function is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github allenai / document-qa / experimental / tmp.py View on Github external
def std():
    """Print summary statistics of the GloVe 840B.300d word vectors.

    Loads the embedding matrix via the corpus resource loader, stacks all
    vectors into one matrix, then prints its shape, per-dimension mean and
    std, and the overall scalar mean and std.
    """
    loader = SquadCorpus().get_resource_loader()
    embeddings = loader.load_word_vec("glove.840B.300d")
    matrix = np.vstack(list(embeddings.values()))
    print(matrix.shape)
    print(matrix.mean(axis=0))
    print(matrix.std(axis=0))
    print(matrix.mean())
    print(matrix.std())
github allenai / document-qa / experimental / tmp.py View on Github external
def main1():
    data = SquadCorpus()
    data.dir = join(config.CORPUS_DIR, "squad-v4")
    data2 = SquadCorpus()
    data2.dir = join(config.CORPUS_DIR, "squad-v2")

    train = data.get_dev()
    train2 = data2.get_dev()
    if len(train) != len(train2):
        raise ValueError()

    for d1, d2 in zip(train, train2):
        if d1.doc_id != d2.doc_id or d1.title != d2.title:
            raise ValueError()
        if len(d1.paragraphs) != len(d2.paragraphs):
            raise ValueError()
        for p1, p2 in zip(d1.paragraphs, d2.paragraphs):
            if p1.text != p2.text or p1.paragraph_num != p2.paragraph_num or p1.original_text != p2.original_text:
                raise ValueError()
            if not np.all(p1.spans == p2.spans):
github allenai / document-qa / train_squad / train4.py View on Github external
attention=BiAttention(TriLinear(bias=True), True),
        match_encoder=SequenceMapperSeq(
            FullyConnected(dim*2, activation="relu"),
            dropout,
        ),
        predictor=BoundsPredictor(
            ChainBiMapper(
                first_layer=recurrent_layer,
                second_layer=recurrent_layer
            ),
        )
    )
    with open(__file__, "r") as f:
        notes = f.read()

    corpus = SquadCorpus()
    train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True, False)
    eval_batching = ClusteredBatcher(45, ContextLenKey(), False, False)
    data = DocumentQaTrainingData(corpus, None, train_batching, eval_batching)

    eval = [LossEvaluator(), BoundedSquadSpanEvaluator(bound=[17])]
    trainer.start_training(data, model, train_params, eval, model_dir.ModelDir(out), notes)
github allenai / document-qa / train_squad / train_context_only.py View on Github external
out = get_output_name_from_cli()

    train_params = TrainParams(SerializableOptimizer("Adadelta", dict(learning_rate=1.0)),
                               num_epochs=16, eval_period=900, log_period=30,
                               async_encoding=5,
                               save_period=900, eval_samples=dict(train=6000, dev=6000))

    model = ContextOnly(
        DocumentAndQuestionEncoder(SingleSpanAnswerEncoder()),
        FixedWordEmbedder(vec_name="glove.6B.100d", word_vec_init_scale=0, learn_unk=False),
        None,
        FullyConnected(50),
        BoundsPredictor(NullBiMapper())
    )

    corpus = SquadCorpus()
    train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True, False)
    eval_batching = ClusteredBatcher(45, ContextLenKey(), False, False)
    data = DocumentQaTrainingData(corpus, None, train_batching, eval_batching)

    eval = [LossEvaluator(), BoundedSquadSpanEvaluator(bound=[17])]
    trainer.start_training(data, model, train_params, eval, model_dir.ModelDir(out), "")
github allenai / document-qa / experimental / batch_paragraph_selection / batch_paragraph_selection.py View on Github external
print("Top3:  %.4f" % (ranks < 4).mean())
        print("Top5:  %.4f" % (ranks < 6).mean())
        print("Top10: %.4f" % (ranks < 11).mean())


def save_prediction(df, feature, output):
    """Dump per-question values of `feature` from `df` to `output` as JSON.

    `df` is expected to carry a MultiIndex with a "question_id" level; for
    each question the selected column is sorted by index and stored as a
    list under that question's id.
    """
    predictions = {
        question_id: list(group_df[feature].sort_index().values)
        for question_id, group_df in df.groupby(level="question_id")
    }
    with open(output, "w") as out_file:
        json.dump(predictions, out_file)

if __name__ == "__main__":
    # Smoke-test of the paragraph-selection feature pipeline on a tiny
    # slice of SQuAD (5 train docs, 5 dev docs) so it finishes quickly.
    corp = SquadCorpus()
    print("Loading...")
    docs = corp.get_train()[:5]
    print("Building features....")
    # NOTE(review): the two positional None arguments' meaning is not visible
    # from this snippet — confirm against build_features' signature.
    df = build_features(docs, corp.get_resource_loader(), None, None, seed=0, dev=corp.get_dev()[:5])

    print("Classifier..,")
    get_classifier_dev_scores(df)

    # Evaluate only the rows built from the dev split.
    show_eval(df[df.source == "dev"])
github allenai / document-qa / experimental / tmp.py View on Github external
def main1():
    data = SquadCorpus()
    data.dir = join(config.CORPUS_DIR, "squad-v4")
    data2 = SquadCorpus()
    data2.dir = join(config.CORPUS_DIR, "squad-v2")

    train = data.get_dev()
    train2 = data2.get_dev()
    if len(train) != len(train2):
        raise ValueError()

    for d1, d2 in zip(train, train2):
        if d1.doc_id != d2.doc_id or d1.title != d2.title:
            raise ValueError()
        if len(d1.paragraphs) != len(d2.paragraphs):
            raise ValueError()
        for p1, p2 in zip(d1.paragraphs, d2.paragraphs):
            if p1.text != p2.text or p1.paragraph_num != p2.paragraph_num or p1.original_text != p2.original_text: