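# The snippets below appear to be excerpts from training and data-inspection scripts
# for a document question-answering codebase (SQuAD and TriviaQA-web experiments).
# Imports and most of the surrounding script scaffolding are omitted; names such as
# `model`, `train_params`, `recurrent_layer`, `dim`, `fe`, and `out` are defined
# elsewhere in the originating scripts and are not shown here.

# Snippet: sanity-check SQuAD training data. Builds context-length-clustered batchers,
# preprocesses a small sample of the corpus, and prints the shape of each training
# point's answer-span array.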
def test_build_training_data():
    train_batching = ClusteredBatcher(60, ContextLenBucketedKey(3), True, False)
    eval_batching = ClusteredBatcher(60, ContextLenKey(), False, False)
    data = PreprocessedData(SquadCorpus(),
                            TagTextAnswers(),
                            ParagraphAndQuestionDatasetBuilder(train_batching, eval_batching),
                            eval_on_verified=False,
                            sample=20, sample_dev=20
                            # sample_dev=100, sample=100, eval_on_verified=False
                            )
    data.preprocess()
    data = data.get_train()
    for batch in data.get_epoch():
        for x in batch:
            print(x.answer.answer_spans.shape)
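
# Snippet: eyeball TriviaQA-web training points. Extracts the single top TF-IDF
# paragraph per question, then prints each question, its answer text, and the context
# with answer spans wrapped in "{{ }}", pausing for Enter between points.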
def show():
    stop = NltkPlusStopWords(True)
    prep = ExtractSingleParagraph(MergeParagraphs(400), TopTfIdf(stop, 3),
                                  WithIndicators(True, True), intern=True)
    train_batching = ClusteredBatcher(60, ContextLenBucketedKey(3), True, False)
    eval_batching = ClusteredBatcher(60, ContextLenKey(), False, False)
    builder = ParagraphAndQuestionsBuilder(train_batching, eval_batching)
    data = PreprocessedData(TriviaQaWebDataset(), prep, builder, eval_on_verified=False,
                            sample_dev=20, sample=100)
    data.preprocess(1)
    for batch in list(data.get_train().get_epoch())[:10]:
        for point in batch:
            print(" ".join(point.question))
            print(point.answer.answer_text)
            context = list(point.get_context())
            for s, e in point.answer.answer_spans:
                context[s] = "{{" + context[s]
                context[e] = context[e] + "}}"
            print(" ".join(context))
            input()
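
# Snippet: tail of a SQuAD model definition (the enclosing constructor and its earlier
# arguments are not shown) followed by the training setup. The prediction layer pools
# the context with an attention encoder and predicts answer bounds with a pair of
# chained bidirectional GRUs; evaluation bounds candidate spans to length 17.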
    predictor=WithFixedContextPredictionLayer(
        # BiRecurrentMapper(GruCellSpec(40)),
        ResidualLayer(BiRecurrentMapper(GruCellSpec(80))),
        AttentionEncoder(post_process=MapperSeq(FullyConnected(25, activation="tanh"), DropoutLayer(0.8))),
        WithProjectedProduct(include_tiled=True),
        ChainBiMapper(
            first_layer=BiRecurrentMapper(GruCellSpec(80)),
            second_layer=BiRecurrentMapper(GruCellSpec(80))
        ),
        aggregate="sum"
    )
)

with open(__file__, "r") as f:
    notes = f.read()

train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True, False)
eval_batching = ClusteredBatcher(45, ContextLenKey(), False, False)
data = PreprocessedData(SquadCorpus(),
                        TagTextAnswers(),
                        ParagraphAndQuestionDatasetBuilder(train_batching, eval_batching),
                        # sample=20, sample_dev=20,
                        eval_on_verified=False)
data.preprocess()

eval = [LossEvaluator(), BoundedSquadSpanEvaluator(bound=[17])]
trainer.start_training(data, model, train_params, eval, model_dir.ModelDir(out), notes, False)
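
# Snippet: tail of a TriviaQA-web model that ends in a confidence-scored span predictor,
# followed by open-domain training setup: ShallowOpenWebRanker(12) keeps the top-ranked
# paragraphs per question, RandomParagraphsBuilder mixes paragraphs into batches, and
# the preprocessed data is cached to "tfidf-open-top12.pkl.gz" for reuse.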
        )),
        VariationalDropoutLayer(0.8)),
    predictor=ConfidencePredictor(
        ChainBiMapper(
            first_layer=recurrent_layer,
            second_layer=recurrent_layer,
        ),
        AttentionEncoder(),
        FullyConnected(80, activation="tanh"),
        aggregate="sum"
    )
)

with open(__file__, "r") as f:
    notes = f.read()

train_batching = ClusteredBatcher(60, ContextLenBucketedKey(3), True, False)
eval_batching = ClusteredBatcher(60, ContextLenKey(), False, False)
builder = RandomParagraphsBuilder(train_batching, eval_batching, 0.5)
prep = ExtractMultiParagraphsPerQuestion(MergeParagraphs(400),
                                         ShallowOpenWebRanker(12),
                                         intern=True,
                                         require_an_answer=True)
data = PreprocessedData(TriviaQaWebDataset(), prep, builder, eval_on_verified=False)
eval = [LossEvaluator(), ConfidenceEvaluator(8)]
data.preprocess(6, 1000)
data.cache_preprocess("tfidf-open-top12.pkl.gz")
# data.load_preprocess("tfidf-open-top12.pkl.gz")
trainer.start_training(data, model, train_params, eval, model_dir.ModelDir(out), notes)
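
# Snippet: tail of a SQuAD model whose prediction layer uses a joint start/end bound
# loss (IndependentBoundsJointLoss), followed by standard SQuAD training with loss,
# span-probability, and bounded-span evaluators.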
    predictor=WithFixedContextPredictionLayer(
        ResidualLayer(recurrent_layer),
        AttentionEncoder(post_process=MapperSeq(FullyConnected(25, activation="tanh"), DropoutLayer(0.8))),
        WithProjectedProduct(include_tiled=True),
        ChainBiMapper(
            first_layer=recurrent_layer,
            second_layer=recurrent_layer
        ),
        IndependentBoundsJointLoss()
    )
)

with open(__file__, "r") as f:
    notes = f.read()

train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True, False)
eval_batching = ClusteredBatcher(45, ContextLenKey(), False, False)
data = DocumentQaTrainingData(SquadCorpus(), None, train_batching, eval_batching)
eval = [LossEvaluator(), SpanProbability(), BoundedSquadSpanEvaluator(bound=[17])]
trainer.start_training(data, model, train_params, eval, model_dir.ModelDir(out), notes, False)
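
# Snippet: tail of a SQuAD model that adds context self-attention (StaticAttentionSelf
# over a trilinear similarity) before a plain bounds predictor; the training setup is
# essentially the same as the previous SQuAD snippet.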
        StaticAttentionSelf(TriLinear(bias=True), ConcatWithProduct()),
        FullyConnected(dim * 2, activation="relu")
        )),
        VariationalDropoutLayer(0.8)),
    predictor=BoundsPredictor(
        ChainBiMapper(
            first_layer=recurrent_layer,
            second_layer=recurrent_layer
        ),
    )
)

with open(__file__, "r") as f:
    notes = f.read()

train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True, False)
eval_batching = ClusteredBatcher(45, ContextLenKey(), False, False)
data = DocumentQaTrainingData(SquadCorpus(), None, train_batching, eval_batching)
eval = [LossEvaluator(), SpanProbability(), BoundedSquadSpanEvaluator(bound=[17])]
trainer.start_training(data, model, train_params, eval, trainer.ModelDir(out), notes)
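
# Snippet: tail of a paragraph-selection model for TriviaQA-web that scores paragraphs
# with a softmax prediction layer. Batches are clustered by the number of paragraphs per
# question, a slice of the training set is held out, and cached preprocessed data is
# loaded from "unigram-para-held-out.pkl". The evaluators report answer coverage among
# the top 1-4 selected paragraphs.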
        DropoutLayer(0.8),
        BiRecurrentMapper(GruCellSpec(50)),
        FullyConnected(30, activation="tanh")
    ),
    merge_with_features=ConcatLayer(),
    map_joint=NullMapper(),
    encode_joint_features=RecurrentEncoder(GruCellSpec(25), None),
    process=SequenceMapperSeq(BiRecurrentMapper(GruCellSpec(25)), FullyConnected(10)),
    predictor=SoftmaxPrediction(),
    any_features=True
)

with open(__file__, "r") as f:
    notes = f.read()

train_batching = ClusteredBatcher(45, NParagraphsSortKey(), True, False)
eval_batching = ClusteredBatcher(45, NParagraphsSortKey(), False, False)
data = PreprocessedData(
    TriviaQaWebDataset(), fe,
    SelectionWithContextDatasetBuilder(train_batching, eval_batching),
    eval_on_verified=False,
    hold_out_train=(0, 5000),
    # sample=200, sample_dev=200,
)
# data.preprocess(8, chunk_size=1000)
# data.cache_preprocess("unigram-para-held-out.pkl")
data.load_preprocess("unigram-para-held-out.pkl")
eval = [LossEvaluator(), AnyTopNEvaluator([1, 2, 3, 4]), PercentAnswerEvaluator([1, 2, 3, 4]), TotalAnswersEvaluator([1, 2, 3, 4])]
trainer.start_training(data, model, train_params, eval, model_dir.ModelDir(out), notes, False)
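
# Snippet: tail of a TriviaQA-web model whose span predictor includes an explicit
# no-answer option, followed by multi-paragraph training: the top 4 TF-IDF paragraphs
# are extracted per document, training batches group paragraph sets per question
# (StratifyParagraphSetsBuilder), and evaluation mixes paragraphs via
# RandomParagraphsBuilder.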
        ChainBiMapper(
            first_layer=recurrent_layer,
            second_layer=recurrent_layer,
        ),
        span_predictor=IndependentBoundsNoAnswerOption()
    )
)

with open(__file__, "r") as f:
    notes = f.read()

stop = NltkPlusStopWords(True)
prep = ExtractMultiParagraphs(MergeParagraphs(400), TopTfIdf(stop, 4),
                              model.preprocessor, intern=True, require_an_answer=True)
eval_batching = ClusteredBatcher(150, ContextLenKey(), False, False)
eval_builder = RandomParagraphsBuilder(eval_batching, eval_batching, 0.5)
train_builder = StratifyParagraphSetsBuilder(35, 35, True, True)
data = PreprocessedData(TriviaQaWebDataset(), prep, train_builder, eval_builder, eval_on_verified=False)
data.preprocess(6, 1000)
eval = [LossEvaluator(), ConfidenceEvaluator(8)]
trainer.start_training(data, model, train_params, eval, model_dir.ModelDir(out), notes, None)
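
# Snippet: tail of a confidence-style predictor (attention pooling plus a fully
# connected layer, summed), followed by TriviaQA-web training that keeps the top 16
# open-web-ranked paragraphs per question and draws paragraphs into batches with
# StratifyParagraphsBuilder (train) and RandomParagraphsBuilder (eval).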
            first_layer=recurrent_layer,
            second_layer=recurrent_layer,
        ),
        AttentionEncoder(),
        FullyConnected(80, activation="tanh"),
        aggregate="sum"
    )
)

with open(__file__, "r") as f:
    notes = f.read()

prep = ExtractMultiParagraphsPerQuestion(MergeParagraphs(400), ShallowOpenWebRanker(16),
                                         model.preprocessor, intern=True, require_an_answer=True)
eval_batching = ClusteredBatcher(180, ContextLenKey(), False, True)
train_batching = ClusteredBatcher(60, ContextLenBucketedKey(3), True, True)
eval_builder = RandomParagraphsBuilder(eval_batching, eval_batching, 0.5, 2)
train_builder = StratifyParagraphsBuilder(train_batching, 2)
data = PreprocessedData(TriviaQaWebDataset(), prep, train_builder, eval_builder, eval_on_verified=False)
data.preprocess(6, 1000)
eval = [LossEvaluator(), ConfidenceEvaluator(8)]
trainer.start_training(data, model, train_params, eval, model_dir.ModelDir(out), notes, None)