def BuildNumber(sentences, dictionary):
    # Map each token to its dictionary index; unknown tokens map to index 1.
    result = []
    for sentence in sentences:
        curline = []
        for token in sentence:
            if not token:  # assumed guard; the original skip condition is truncated here
                continue
            if token in dictionary:
                curline.append(dictionary[token])
            else:
                curline.append(1)
        result.append(curline)
    return result
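# A tiny illustration of BuildNumber with made-up tokens (hypothetical data, not part of
# the pipeline below): in-vocabulary tokens map to their index, unknown tokens map to 1.
_toy_dictionary = {"good": 2, "movie": 3}
print(BuildNumber([["good", "movie"], ["bad", "movie"]], _toy_dictionary))  # [[2, 3], [1, 3]]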
sentences = ProcessSentence(sentences)
testsentence = ProcessSentence(testsentence)
testsentence, TestY = GetTestIndex(testsentence, 0, dftest)
dictionary, MaxTokens = GetDictionary(sentences)
TrainX = BuildNumber(sentences, dictionary)
TestX = BuildNumber(testsentence, dictionary)
TrainY = GetLabel(0, df)
TrainX = pad_sequences(TrainX, maxlen=100, value=0.)
TestX = pad_sequences(TestX, maxlen=100, value=0.)
TrainX = np.array(TrainX)
TrainY = np.array(TrainY)
TestX = np.array(TestX)
TestY = np.array(TestY)
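# Quick sanity check of the arrays produced above (a sketch, not required by the pipeline):
# pad_sequences(maxlen=100) makes every row exactly 100 ids long, and the labels stay
# aligned one-to-one with their padded inputs.
assert TrainX.shape[1] == 100 and TestX.shape[1] == 100
assert len(TrainX) == len(TrainY) and len(TestX) == len(TestY)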
params = {
    "metrics": [],
    "batch_size": 64,
    "class_num": 2,
    "num_epochs": 1,
    "learning_rate": 1e-3,
    "hidden_dimension": 128,
    "sentence_len": 100,
    "embdding_dimension": 128,
    "vocab_size": 45647,
}
# 1.load data with vocabulary of words and labels
vocabulary_word2index, vocabulary_index2word = create_voabulary()
vocab_size = len(vocabulary_word2index)
vocabulary_word2index_label,vocabulary_index2word_label = create_voabulary_label()
questionid_question_lists=load_final_test_data(FLAGS.predict_source_file)
test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists)
testX=[]
question_id_list=[]
for question_id, question_string_list in test:
    question_id_list.append(question_id)
    testX.append(question_string_list)
# 2.Data preprocessing: Sequence padding
print("start padding....")
testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length
print("end padding...")
# 3.create session.
config=tf.ConfigProto()
config.gpu_options.allow_growth=True
with tf.Session(config=config) as sess:
    # 4. Instantiate Model
    fast_text = fastText(FLAGS.label_size, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,
                         FLAGS.decay_rate, FLAGS.num_sampled, FLAGS.sentence_len, vocab_size,
                         FLAGS.embed_size, FLAGS.is_training)
    saver = tf.train.Saver()
    if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
        print("Restoring Variables from Checkpoint")
        saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
    else:
        print("Can't find the checkpoint. Going to stop.")
        return
    # 5. Feed data to get logits
output_csv_test = "fold,loss,loss_ce,loss_l2,loss_sim,loss_sub,hamming_loss,acc,prec,rec,f1,acc@k,hamming_loss@k,prec@k,rec@k,f1@k"
# start iterating over k-folds for training and testing
num_run = 0
time_train = [0] * num_runs  # track time spent in training for each run
for train, valid in zip(trainlist, validlist):
    print('\n--RUN', num_run, 'START--\n')
    start_time_train = time.time()  # starting time of training
    # k-fold dataset creation
    trainX, trainX_title, trainY = train
    validX, validX_title, validY = valid
    # Data preprocessing: sequence padding
    print("start padding & transform to one hot...")
    trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
    trainX_title = pad_sequences(trainX_title, maxlen=FLAGS.sequence_length_title, value=0.)
    validX = pad_sequences(validX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
    validX_title = pad_sequences(validX_title, maxlen=FLAGS.sequence_length_title, value=0.)
    # with open(FLAGS.cache_path, 'w') as data_f:  # save data to a cache file so it can be reused next time
    #     pickle.dump((trainX, trainY, testX, testY, vocabulary_index2word), data_f)
    print("trainX[0]:", trainX[0])  # print("trainY[0]:", trainY[0])
    # print("validX[0]:", validX[0])
    # Converting labels to binary vectors
    print("end padding & transform to one hot...")
    saver = tf.train.Saver()
    if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
        print("Restoring Variables from Checkpoint")
        saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
    else:
        print('Initializing Variables')
        sess.run(tf.global_variables_initializer())  # initialise parameters
        if FLAGS.use_embedding:  # load pre-trained word embedding
            assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, model, num_run,
                                             word2vec_model_path=word2vec_model_path)
rnn_hidden_size = FLAGS.rnn_hidden_size
num_filters = FLAGS.num_filters
dropout_prob = FLAGS.dropout_prob
learning_rate = FLAGS.learning_rate
batch_size = FLAGS.batch_size
num_epochs = FLAGS.num_epochs
# IMDB Dataset loading
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=vocab_size, valid_portion=0.1)
trainX, trainY = train
testX, testY = test
# Sequence padding
trainX = pad_sequences(trainX, maxlen=maxlen, value=0.)
testX = pad_sequences(testX, maxlen=maxlen, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)
# Building network
network = input_data(shape=[None, maxlen], name='input')
network = embedding(
    network,
    input_dim=vocab_size,
    output_dim=embedding_dim,
    trainable=True)
network = bidirectional_rnn(
print("vocab_size:",vocab_size)
vocabulary_word2index_label,vocabulary_index2word_label = create_voabulary_label()
questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) #TODO
test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) #TODO
testX=[]
question_id_list=[]
for question_id, question_string_list in test:
    question_id_list.append(question_id)
    testX.append(question_string_list)
# 2.Data preprocessing: Sequence padding
print("start padding....")
testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length
print("end padding...")
# 3.create session.
config=tf.ConfigProto()
config.gpu_options.allow_growth=True
with tf.Session(config=config) as sess:
    # 4. Instantiate Model
    fast_text = fastText(FLAGS.label_size, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,
                         FLAGS.decay_rate, FLAGS.num_sampled, FLAGS.sentence_len, vocab_size,
                         FLAGS.embed_size, FLAGS.is_training)
    saver = tf.train.Saver()
    if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
        print("Restoring Variables from Checkpoint")
        saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
    else:
        print("Can't find the checkpoint. Going to stop.")
        return
    # 5. Feed data to get logits
def padseq(data, pad=0, raw=False):
    # Pad (or truncate) every sequence in `data` to length `pad`. With raw=True the items are
    # token lists and get padded with the string 'PAD'; otherwise tflearn pads the integer ids.
    if pad == 0:
        return data
    elif raw:
        padded_data = []
        for d in data:
            diff = pad - len(d)
            if diff > 0:
                pads = ['PAD'] * diff
                d = d + pads
                padded_data.append(d[:pad])
            else:
                padded_data.append(d[:pad])
        return padded_data
    else:
        return tflearn.data_utils.pad_sequences(data, maxlen=pad, dtype='int32',
                                                padding='post', truncating='post', value=0)
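# A minimal usage sketch for padseq (hypothetical inputs, not from the original script):
# raw=True pads token lists with 'PAD' and truncates to `pad`; raw=False delegates to
# tflearn's pad_sequences with post-padding and post-truncation of the integer ids.
print(padseq([[3, 7, 2], [5, 1, 4, 9, 8, 6]], pad=5))  # rows become [3 7 2 0 0] and [5 1 4 9 8]
print(padseq([['to', 'be'], ['or', 'not', 'to', 'be', 'that']], pad=4, raw=True))
# [['to', 'be', 'PAD', 'PAD'], ['or', 'not', 'to', 'be']]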
# 1.load data with vocabulary of words and labels
vocabulary_word2index, vocabulary_index2word = create_voabulary(word2vec_model_path=FLAGS.word2vec_model_path,name_scope="transformer") # simple='simple'
vocab_size = len(vocabulary_word2index)
print("transformer.vocab_size:", vocab_size)
vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="transformer",use_seq2seq=True)
questionid_question_lists=load_final_test_data(FLAGS.predict_source_file)
test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists)
testX=[]
question_id_list=[]
for question_id, question_string_list in test:
    question_id_list.append(question_id)
    testX.append(question_string_list)
# 2.Data preprocessing: Sequence padding
print("start padding....")
testX2 = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.) # padding to max length
print("end padding...")
# 3.create session.
config=tf.ConfigProto()
config.gpu_options.allow_growth=True
with tf.Session(config=config) as sess:
    # 4. Instantiate Model
    model = Transformer(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,
                        FLAGS.decay_rate, FLAGS.sequence_length, vocab_size, FLAGS.embed_size,
                        FLAGS.d_model, FLAGS.d_k, FLAGS.d_v, FLAGS.h, FLAGS.num_layer, FLAGS.is_training,
                        decoder_sent_length=FLAGS.decoder_sent_length, l2_lambda=FLAGS.l2_lambda)
    saver = tf.train.Saver()
    if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
        print("Restoring Variables from Checkpoint")
        saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
    else:
        print("Can't find the checkpoint. Going to stop.")
        return
    # 5. Feed data to get logits
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
import numpy as np
# IMDB Dataset loading
train, test, _ = imdb.load_data(path='data/imdb.pkl', n_words=30000,
                                valid_portion=0.1)
trainX, trainY = train
testX, testY = test
# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=500, value=0.)
testX = pad_sequences(testX, maxlen=500, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)
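# Shape check for the preprocessed IMDB data (a sketch, not part of the original example):
# after pad_sequences every review is a fixed-length vector of 500 word ids, and
# to_categorical turns each 0/1 label into a one-hot pair such as [1. 0.] or [0. 1.].
print(trainX.shape, trainY.shape)  # e.g. (N, 500) and (N, 2)
print(testX.shape, testY.shape)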
class IMDBDataset():
    def __init__(self, X, Y):
        self.num_examples = len(X)
        self.inputs = X
        self.tags = Y
        self.ptr = 0

    def minibatch(self, size):
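# The minibatch method above is truncated. A minimal sketch of the usual behaviour
# (an assumption, not confirmed by the snippet): return the next `size` examples and
# advance the pointer, wrapping around at the end of the dataset.
def _minibatch_sketch(dataset, size):
    end = dataset.ptr + size
    batch_x, batch_y = dataset.inputs[dataset.ptr:end], dataset.tags[dataset.ptr:end]
    dataset.ptr = end % dataset.num_examples
    return batch_x, batch_y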
vocabulary_word2index, vocabulary_index2word = create_voabulary()
vocab_size = len(vocabulary_word2index)
vocabulary_word2index_label,_ = create_voabulary_label()
train, test, _ = load_data(vocabulary_word2index, vocabulary_word2index_label,data_type='train')
trainX, trainY = train
testX, testY = test
print("testX.shape:", np.array(testX).shape) # 2500个list.每个list代表一句话
print("testY.shape:", np.array(testY).shape) # 2500个label
print("testX[0]:", testX[0]) # [17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4]
print("testX[1]:", testX[1]);
print("testY[0]:", testY[0]) # 0 ;print("testY[1]:",testY[1]) #0
# 2.Data preprocessing
# Sequence padding
print("start padding & transform to one hot...")
trainX = pad_sequences(trainX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length
testX = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length
###############################################################################################
#with open(FLAGS.cache_path, 'w') as data_f: #save data to cache file, so we can use it next time quickly.
# pickle.dump((trainX,trainY,testX,testY,vocabulary_index2word),data_f)
###############################################################################################
print("testX[0]:", testX[0]) ;print("testX[1]:", testX[1]); #[17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4]
# Converting labels to binary vectors
print("testY[0]:", testY[0]) # 0 ;print("testY[1]:",testY[1]) #0
print("end padding & transform to one hot...")
#2.create session.
config=tf.ConfigProto()
config.gpu_options.allow_growth=True
with tf.Session(config=config) as sess:
    # Instantiate Model
    fast_text = fastText(FLAGS.label_size, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,
                         FLAGS.decay_rate, FLAGS.num_sampled, FLAGS.sentence_len, vocab_size,
                         FLAGS.embed_size, FLAGS.is_training)
    # Initialize Saver
def main(_):
    # 1. Load data with vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple', word2vec_model_path=FLAGS.word2vec_model_path, name_scope="rnn")
    vocab_size = len(vocabulary_word2index)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="rnn")
    questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)
    test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label, questionid_question_lists)
    testX = []
    question_id_list = []
    for question_id, question_string_list in test:
        question_id_list.append(question_id)
        testX.append(question_string_list)
    # 2. Data preprocessing: sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
    print("end padding...")
    # 3. Create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4. Instantiate Model
        textRNN = TextRNN(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,
                          FLAGS.decay_rate, FLAGS.sequence_length, vocab_size, FLAGS.embed_size,
                          FLAGS.is_training)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint for TextRNN")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint. Going to stop.")
            return
        # 5. Feed data to get logits