How to use the squad.build_squad_dataset.SquadCorpus class in squad

To help you get started, we've selected a few SquadCorpus examples, based on popular ways it is used in public projects.

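Before the project examples, here is a minimal sketch of the common call pattern, using only methods that appear in the snippets below. The import path is an assumption (SquadCorpus and split_docs live in the squad package of allenai/document-qa; adjust it to your checkout), and the corpus must already have been built by the project's preprocessing scripts.

from docqa.squad.squad_data import SquadCorpus, split_docs  # import path is an assumption

corpus = SquadCorpus()                   # reads the preprocessed SQuAD corpus from disk
train_docs = corpus.get_train()          # documents grouped into paragraphs with questions
paragraphs = split_docs(train_docs)      # flatten documents into paragraph-level examples
loader = corpus.get_resource_loader()    # the corpus also hands out pretrained word vectors
vecs = loader.load_word_vec("glove.840B.300d")
print("train paragraphs: %d, words with vectors: %d" % (len(paragraphs), len(vecs)))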

GitHub: allenai / document-qa / data_analysis / show_dropped_names.py
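In this script, word counts are computed over a QA corpus to initialize a NameDetector; a boolean flag switches the data source between SquadCorpus and a preprocessed TriviaQA web dataset.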
def show_answers():
    print("Loading...")
    squad = False  # toggle between the SQuAD corpus and preprocessed TriviaQA web data
    if squad:
        corpus = SquadCorpus()
        docs = corpus.get_train()
        data = split_docs(docs)
    else:
        stop = NltkPlusStopWords()
        data = PreprocessedData(TriviaQaWebDataset(),
                                ExtractSingleParagraph(MergeParagraphs(400), TopTfIdf(stop, 1), intern=True),
                                InMemoryWebQuestionBuilder(None, None),
                                eval_on_verified=False
                                )
        data.load_preprocess("triviaqa-web-merge400-tfidf1.pkl.gz")
        data = data.get_train().data
    print("Get voc...")

    # word counts over the loaded data seed the name detector
    detector = NameDetector()
    wc = QaCorpusLazyStats(data).get_word_counts()
    detector.init(wc)

GitHub: allenai / document-qa / data_analysis / show_dropped_names.py
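This example pairs SquadCorpus with its resource loader to fetch GloVe vectors, then reports how many of the words flagged as names have a pretrained vector.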
def show_nn():
    corpus = SquadCorpus()
    print("Load train")
    data = split_docs(corpus.get_train())
    print("Compute stats")
    wc = QaCorpusLazyStats(data).get_word_counts()
    detector = NameDetector(wc)
    print("Load vecs")
    vecs = corpus.get_resource_loader().load_word_vec("glove.840B.300d")

    print('Scanning...')
    # count occurrences of words the detector flags as names
    names = Counter()
    for word, c in wc.items():
        if detector.is_name(word):
            names[word] = c
    # report how many detected names are covered by the pretrained vectors
    vec_names = [k for k in names if k in vecs]
    print("Have vec for %d/%d (%.4f)" % (len(vec_names), len(names), len(vec_names) / len(names)))

GitHub: allenai / document-qa / train_squad / train3.py
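Here SquadCorpus feeds a full training run: DocumentQaTrainingData wraps the corpus, ClusteredBatcher groups examples by context length, and evaluation uses a bounded SQuAD span evaluator.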
    # In the source this block is the predictor argument of the enclosing
    # model definition; the surrounding call is not shown in this excerpt.
    predictor = WithFixedContextPredictionLayer(
        ResidualLayer(recurrent_layer),
        AttentionEncoder(post_process=MapperSeq(FullyConnected(25, activation="tanh"), DropoutLayer(0.8))),
        WithProjectedProduct(include_tiled=True),
        ChainBiMapper(
            first_layer=recurrent_layer,
            second_layer=recurrent_layer
        ),
        span_predictor=BoundedSpanPredictor(20)
    )

    # store this script's own source as notes alongside the trained model
    with open(__file__, "r") as f:
        notes = f.read()

    corpus = SquadCorpus()
    # cluster examples of similar context length into batches of 45
    train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True, False)
    eval_batching = ClusteredBatcher(45, ContextLenKey(), False, False)
    data = DocumentQaTrainingData(corpus, None, train_batching, eval_batching)

    eval = [LossEvaluator(), BoundedSquadSpanEvaluator(bound=[17])]
    trainer.start_training(data, model, train_params, eval, model_dir.ModelDir(out), notes)

GitHub: allenai / document-qa / data_analysis / show_dropped_names.py
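The same SQuAD/TriviaQA toggle drives main(), which builds a DropNamesV2 embedding layer (kind="shuffle") over GloVe vectors with a NameDetector selector.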
def main():
    # word embedder that perturbs the vectors of detected names (kind="shuffle")
    embed = DropNamesV2(vec_name="glove.840B.300d",
                        selector=NameDetector(),
                        word_vec_init_scale=0, learn_unk=False,
                        keep_probs=0, kind="shuffle")
    corpus = SquadCorpus()
    squad = False  # toggle between the SQuAD corpus and preprocessed TriviaQA web data
    print("Loading...")
    if squad:
        docs = corpus.get_train()
        data = split_docs(docs)
    else:
        stop = NltkPlusStopWords()
        data = PreprocessedData(TriviaQaWebDataset(),
                                ExtractSingleParagraph(MergeParagraphs(400), TopTfIdf(stop, 1), intern=True),
                                InMemoryWebQuestionBuilder(None, None),
                                eval_on_verified=False
                                )
        data.load_preprocess("triviaqa-web-merge400-tfidf1.pkl.gz")
        data = data.get_train().data
    print("Get voc...")