def submit_result(self, submit_msg='KakaoAutoML'):
    """
    Submit the result to Kaggle and wait for the leaderboard to update.
    """
    logger.info('kaggle.submit_result: initialization')
    api_client = KaggleApi()
    api_client.authenticate()
    submissions = api_client.competitionSubmissions(KaggleSubmission.CNAME)
    last_idx = submissions[0].ref if len(submissions) > 0 else -1

    # submit
    logger.info('kaggle.submit_result: trying to submit @ %s' % self.get_filepath())
    submit_result = api_client.competitionSubmit(self.get_filepath(), submit_msg, KaggleSubmission.CNAME)
    logger.info('kaggle.submit_result: submitted!')

    # wait for the updated LB
    wait_interval = 10  # in seconds
    for _ in range(60 // wait_interval * 5):
        time.sleep(wait_interval)  # assumed polling delay; the loop count above is derived from wait_interval
        submissions = api_client.competitionSubmissions(KaggleSubmission.CNAME)
        if len(submissions) == 0:
            continue
        if submissions[0].status == 'complete' and submissions[0].ref != last_idx:
            # assumed completion of the truncated snippet: a new finished submission appeared, stop waiting
            break
def kernel_submit(self):
    self.info('kernel_submit updating dataset')
    folder = 'submit'
    os.makedirs(folder, exist_ok=True)
    shutil.copy(self.file, os.path.join(folder, self.file_name))

    config = api.read_config_file()
    username = config['username']
    title = f'{self.competition}-{self.kernel_suffix}-dataset'
    dataset_meta = {
        'title': title,
        'id': f'{username}/{title}',
        'licenses': [{
            'name': 'CC0-1.0'
        }]
    }
    with open(f'{folder}/dataset-metadata.json', 'w') as f:
        json.dump(dataset_meta, f)

    res = api.dataset_status(dataset_meta['id'])
    if res != 'ready':
        res = api.dataset_create_new(folder=folder)
        if res.status == 'error':
            # assumed completion of the truncated snippet: surface the error reported by the Kaggle API
            raise RuntimeError(f'kernel_submit: dataset creation failed: {res.error}')
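    # Hedged sketch (an assumption, not part of the snippet above): when
    # dataset_status reports 'ready' the dataset already exists, so the usual
    # follow-up with the Kaggle API is to push a new version of the files in
    # `folder` rather than creating the dataset again; the version_notes text
    # is only an illustrative placeholder.
    else:
        api.dataset_create_version(folder, version_notes='update submission file')
        self.info('kernel_submit dataset version pushed')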
DATA_DIR = "../data/comp_data"
MODEL_DIR = "../data/models"
WORD2VEC_BIN = "GoogleNews-vectors-negative300.bin.gz"
WORD2VEC_EMBED_SIZE = 300
QA_TRAIN_FILE = "8thGr-NDMC-Train.csv"
QA_EMBED_SIZE = 64
BATCH_SIZE = 32
NBR_EPOCHS = 20
## extract data
print("Loading and formatting data...")
qapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TRAIN_FILE))
question_maxlen = max([len(qapair[0]) for qapair in qapairs])
answer_maxlen = max([len(qapair[1]) for qapair in qapairs])
seq_maxlen = max([question_maxlen, answer_maxlen])
word2idx = kaggle.build_vocab([], qapairs, [])
vocab_size = len(word2idx) + 1 # include mask character 0
Xq, Xa, Y = kaggle.vectorize_qapairs(qapairs, word2idx, seq_maxlen)
Xqtrain, Xqtest, Xatrain, Xatest, Ytrain, Ytest = \
    train_test_split(Xq, Xa, Y, test_size=0.3, random_state=42)
print(Xqtrain.shape, Xqtest.shape, Xatrain.shape, Xatest.shape,
      Ytrain.shape, Ytest.shape)
# get embeddings from word2vec
# see https://github.com/fchollet/keras/issues/853
QA_EMBED_SIZE = 64
BATCH_SIZE = 128
NBR_EPOCHS = 20
## extract data
print("Loading and formatting data...")
qapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TRAIN_FILE))
question_maxlen = max([len(qapair[0]) for qapair in qapairs])
answer_maxlen = max([len(qapair[1]) for qapair in qapairs])
# Even though we don't use the test set for classification, we still need
# to consider any additional vocabulary words from it for when we use the
# model for prediction (against the test set).
tqapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TEST_FILE), is_test=True)
tq_maxlen = max([len(qapair[0]) for qapair in tqapairs])
ta_maxlen = max([len(qapair[1]) for qapair in tqapairs])
seq_maxlen = max([question_maxlen, answer_maxlen, tq_maxlen, ta_maxlen])
word2idx = kaggle.build_vocab([], qapairs, tqapairs)
vocab_size = len(word2idx) + 1 # include mask character 0
Xq, Xa, Y = kaggle.vectorize_qapairs(qapairs, word2idx, seq_maxlen)
Xqtrain, Xqtest, Xatrain, Xatest, Ytrain, Ytest = \
    train_test_split(Xq, Xa, Y, test_size=0.3, random_state=42)
print(Xqtrain.shape, Xqtest.shape, Xatrain.shape, Xatest.shape,
      Ytrain.shape, Ytest.shape)
# get embeddings from word2vec
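# Hedged sketch (assumptions, not part of the snippet above): the comment about
# including test-set vocabulary implies the test pairs are later vectorized with
# the same word2idx / seq_maxlen and scored by the trained network; `model` is
# only a placeholder name for that network.
# Xqtest_final, Xatest_final, _ = kaggle.vectorize_qapairs(tqapairs, word2idx, seq_maxlen)
# predictions = model.predict([Xqtest_final, Xatest_final], batch_size=BATCH_SIZE)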
DATA_DIR = "../data/comp_data"
QA_TRAIN_FILE = "8thGr-NDMC-Train.csv"
STORY_FILE = "studystack_qa_cleaner_no_qm.txt"
STORY_WEIGHTS = "lstm-story-weights.txt"
STORY_BIAS = "lstm-story-bias.txt"
EMBED_SIZE = 64
BATCH_SIZE = 256
NBR_EPOCHS = 20
stories = kaggle.get_stories(os.path.join(DATA_DIR, STORY_FILE))
story_maxlen = max([len(words) for words in stories])
# this part is only required to get the maximum sequence length
qapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TRAIN_FILE))
question_maxlen = max([len(qapair[0]) for qapair in qapairs])
answer_maxlen = max([len(qapair[1]) for qapair in qapairs])
seq_maxlen = max([story_maxlen, question_maxlen, answer_maxlen])
word2idx = kaggle.build_vocab(stories, qapairs, [])
vocab_size = len(word2idx)
Xs = kaggle.vectorize_stories(stories, word2idx, seq_maxlen)
Xstrain, Xstest = train_test_split(Xs, test_size=0.3, random_state=42)
print(Xstrain.shape, Xstest.shape)
inputs = Input(shape=(seq_maxlen, vocab_size))
encoded = LSTM(EMBED_SIZE)(inputs)
decoded = RepeatVector(seq_maxlen)(encoded)
decoded = LSTM(vocab_size, return_sequences=True)(decoded)
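# Hedged sketch (an assumption, not part of the snippet above): the layers are
# wired into a trainable sequence autoencoder in the same style as the Dense
# autoencoder shown next; note that the (seq_maxlen, vocab_size) input expects
# one-hot encoded stories, so Xstrain / Xstest would need to be expanded before
# calling fit(), and the loss choice here is illustrative.
autoencoder = Model(input=inputs, output=decoded)
autoencoder.compile(optimizer="adadelta", loss="categorical_crossentropy")
autoencoder.summary()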
EMBED_SIZE = 64
BATCH_SIZE = 256
NBR_EPOCHS = 20
stories = kaggle.get_stories(os.path.join(DATA_DIR, STORY_FILE))
story_maxlen = max([len(words) for words in stories])
# this part is only required to get the maximum sequence length
qapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TRAIN_FILE))
question_maxlen = max([len(qapair[0]) for qapair in qapairs])
answer_maxlen = max([len(qapair[1]) for qapair in qapairs])
seq_maxlen = max([story_maxlen, question_maxlen, answer_maxlen])
word2idx = kaggle.build_vocab(stories, qapairs, [])
vocab_size = len(word2idx)
Xs = kaggle.vectorize_stories(stories, word2idx, seq_maxlen)
Xstrain, Xstest = train_test_split(Xs, test_size=0.3, random_state=42)
print(Xstrain.shape, Xstest.shape)
signal = Input(shape=(seq_maxlen,))
encoded = Dense(EMBED_SIZE, init="glorot_uniform", activation="relu")(signal)
decoded = Dense(seq_maxlen, init="glorot_uniform", activation="sigmoid")(encoded)
autoencoder = Model(input=signal, output=decoded)
autoencoder.compile("adadelta", loss="binary_crossentropy")
autoencoder.fit(Xstrain, Xstrain, nb_epoch=NBR_EPOCHS, batch_size=BATCH_SIZE,
                shuffle=True, validation_data=(Xstest, Xstest))
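# Hedged sketch (an assumption, not part of the snippet above): the
# STORY_WEIGHTS / STORY_BIAS filenames defined in the earlier snippet suggest
# the learned encoder parameters are exported as plain text; one way to do that
# for the encoding Dense layer above:
import numpy as np
enc_weights, enc_bias = autoencoder.layers[1].get_weights()  # layer 1 is the encoding Dense
np.savetxt(os.path.join(DATA_DIR, STORY_WEIGHTS), enc_weights)
np.savetxt(os.path.join(DATA_DIR, STORY_BIAS), enc_bias)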
QA_TRAIN_FILE = "8thGr-NDMC-Train.csv"
QA_EMBED_SIZE = 64
BATCH_SIZE = 32
NBR_EPOCHS = 20
## extract data
print("Loading and formatting data...")
qapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TRAIN_FILE))
question_maxlen = max([len(qapair[0]) for qapair in qapairs])
answer_maxlen = max([len(qapair[1]) for qapair in qapairs])
seq_maxlen = max([question_maxlen, answer_maxlen])
word2idx = kaggle.build_vocab([], qapairs, [])
vocab_size = len(word2idx) + 1 # include mask character 0
Xq, Xa, Y = kaggle.vectorize_qapairs(qapairs, word2idx, seq_maxlen)
Xqtrain, Xqtest, Xatrain, Xatest, Ytrain, Ytest = \
    train_test_split(Xq, Xa, Y, test_size=0.3, random_state=42)
print(Xqtrain.shape, Xqtest.shape, Xatrain.shape, Xatest.shape,
      Ytrain.shape, Ytest.shape)
# get embeddings from word2vec
# see https://github.com/fchollet/keras/issues/853
print("Loading Word2Vec model and generating embedding matrix...")
word2vec = Word2Vec.load_word2vec_format(
    os.path.join(DATA_DIR, WORD2VEC_BIN), binary=True)
embedding_weights = np.zeros((vocab_size, WORD2VEC_EMBED_SIZE))
for word, index in word2idx.items():
    try:
        # assumed completion of the truncated loop: copy the pretrained vector;
        # words missing from Word2Vec keep their zero rows
        embedding_weights[index, :] = word2vec[word]
    except KeyError:
        pass
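# Hedged sketch (an assumption about the next step, not part of the snippet
# above): the embedding matrix is typically handed to a Keras Embedding layer
# that encodes the question branch of the QA model; the layer choices and sizes
# here are illustrative only.
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dropout

qenc = Sequential()
qenc.add(Embedding(input_dim=vocab_size, output_dim=WORD2VEC_EMBED_SIZE,
                   input_length=seq_maxlen, weights=[embedding_weights]))
qenc.add(LSTM(QA_EMBED_SIZE, return_sequences=False))
qenc.add(Dropout(0.3))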