import nltk
from nltk.tokenize import word_tokenize

def preprocess(text):
    """
    Preprocess text for the encoder: split each document into sentences,
    word-tokenize them, and rejoin the tokens with spaces.
    """
    X = []
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    print("Loaded NLTK data")
    for t in text:
        sents = sent_detector.tokenize(t)
        result = ''
        for s in sents:
            tokens = word_tokenize(s)
            result += ' ' + ' '.join(tokens)
        X.append(result)
    return X
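
# Minimal usage sketch for preprocess(), assuming nltk.download('punkt') has been run:
docs = ["First sentence. Second sentence!", "Another short document."]
print(preprocess(docs))
# -> [' First sentence . Second sentence !', ' Another short document .']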
def _build_para_dict(self):
    path = "data/ppdb-2.0-s-all"
    lines = read_lines(path)
    equivalent_pairs = []
    print("Preprocessing raw data...")
    for line in tqdm(lines):
        split = line.split(" ||| ")
        if split[-1] == "Equivalence":
            equivalent_pairs.append(tuple(split[1:3]))

    equivalent_pairs_ubuntu = []
    print("Extracting paraphrase pairs...")
    for pair in tqdm(equivalent_pairs):
        tokens_0 = word_tokenize(pair[0])
        tokens_1 = word_tokenize(pair[1])
        if not (self._contains_unknown(tokens_0) or self._contains_unknown(tokens_1)):
            equivalent_pairs_ubuntu.append((tokens_0, tokens_1))

    # Insert paraphrases in both directions
    print("Building dictionary...")
    self.paraphrase_dict = {}
    for (p0, p1) in tqdm(equivalent_pairs_ubuntu):
        p0 = tuple(p0)
        p1 = tuple(p1)
        try:
            self.paraphrase_dict[p0] = self.paraphrase_dict[p0] + [p1]
        except KeyError:
            self.paraphrase_dict[p0] = [p1]
        try:
            self.paraphrase_dict[p1] = self.paraphrase_dict[p1] + [p0]
        except KeyError:
            self.paraphrase_dict[p1] = [p0]
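
# Each PPDB line splits on " ||| " with the phrase pair in fields 1-2 and the
# entailment label last; a standalone sketch of the same dictionary-building idea
# (illustrative names, not the original project's helpers):
from collections import defaultdict
from nltk.tokenize import word_tokenize

def build_paraphrase_dict(lines):
    para = defaultdict(list)
    for line in lines:
        fields = line.split(" ||| ")
        if fields[-1] == "Equivalence":
            p0 = tuple(word_tokenize(fields[1]))
            p1 = tuple(word_tokenize(fields[2]))
            para[p0].append(p1)
            para[p1].append(p0)
    return para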
def process_text(self, input_text):
    token_text = word_tokenize(str(input_text))
    return token_text
#for a in answer:
#answer_start = int(a['answer_start'])
# will have to update with some kind of span prediction using ROUGE-L;
# for now pick a random start offset inside the first 80% of the passage
answer_start = randint(0, int(len(passage_concat) * 8 / 10))
# add '.' here, just because NLTK is not good enough in some cases
answer_words = word_tokenize(answer_1 + '.')
if answer_words[-1] == '.':
    answer_words = answer_words[:-1]
else:
    answer_words = word_tokenize(answer_1)

# word level
prev_context_words = word_tokenize(passage_concat[:answer_start])
left_context_words = word_tokenize(passage_concat[answer_start:])
pos_list = []
for i in range(len(answer_words)):
    if i < len(left_context_words):
        pos_list.append(len(prev_context_words) + i)
#assert(len(pos_list) > 0)
if len(pos_list) == 0:
    print(answer_words)
    print(answer)
    print(ab)
    print(question)
    assert(False)

# sent level: record answer position as [sent_idx, word_idx]
for idx, sent in enumerate(passage_sent):
    if sublist_exists(answer_words, sent):
        pass  # span bookkeeping continues beyond this excerpt
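
# sublist_exists() is not shown in this excerpt; a plausible minimal version
# (an assumption, not the original helper) checks for a contiguous match:
def sublist_exists(sub, lst):
    n = len(sub)
    return n > 0 and any(lst[i:i + n] == sub for i in range(len(lst) - n + 1))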
def get_latitude(self, user_input):
    """
    Return the latitude value extracted from the input, or '' if none is found.
    """
    import re
    from nltk import tokenize
    for token in tokenize.word_tokenize(user_input):
        if 'latitude=' in token:
            return re.sub('latitude=', '', token)
    return ''
symbol.append('?')
for batch_i, (input_batch, output_batch, source_sent_lengths, tar_sent_lengths) in enumerate(
        data_utils.get_batches(x_val, y_val, self.batch_size)):
    answer_logits = sess.run(self.inference_logits,
                             feed_dict={self.input_data: input_batch,
                                        self.source_sentence_length: source_sent_lengths,
                                        self.keep_prob: 1.0,
                                        self.word_dropout_keep_prob: 1.0,
                                        self.z_temperature: self.z_temp})

    for k, pred in enumerate(answer_logits):
        hypotheses_val.append(
            word_tokenize(
                " ".join([self.decoder_idx_word[i] for i in pred
                          if i not in [self.pad, -1, self.eos]])) + symbol)
        references_val.append([word_tokenize(true_val[batch_i * self.batch_size + k])])

bleu_scores = eval_utils.calculate_bleu_scores(references_val, hypotheses_val)
self.epoch_bleu_score_val['1'].append(bleu_scores[0])
self.epoch_bleu_score_val['2'].append(bleu_scores[1])
self.epoch_bleu_score_val['3'].append(bleu_scores[2])
self.epoch_bleu_score_val['4'].append(bleu_scores[3])
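
# eval_utils.calculate_bleu_scores() is project-specific; one way to get the same
# BLEU-1..4 percentages with NLTK (a sketch under that assumption) is:
from nltk.translate.bleu_score import corpus_bleu

def calculate_bleu_scores(references, hypotheses):
    weights = [(1.0, 0, 0, 0),
               (0.5, 0.5, 0, 0),
               (1.0 / 3, 1.0 / 3, 1.0 / 3, 0),
               (0.25, 0.25, 0.25, 0.25)]
    return [round(corpus_bleu(references, hypotheses, weights=w) * 100, 2) for w in weights]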
import re
import itertools
from unidecode import unidecode
from nltk.tokenize import word_tokenize

def clearstring(string):
    string = unidecode(string)                  # transliterate accents to plain ASCII
    string = re.sub('[^A-Za-z ]+', '', string)  # keep letters and spaces only
    string = word_tokenize(string)
    string = filter(None, string)               # drop empty tokens
    string = [y.strip() for y in string]
    string = ' '.join(string).lower()
    # collapse any character repeated more than twice down to two ('soooo' -> 'soo')
    return ''.join(''.join(s)[:2] for _, s in itertools.groupby(string))
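
# Example (hypothetical input):
# clearstring("Heyyy, how are youuu?")  # -> 'heyy how are youu'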
# requires: import re, string; from nltk import word_tokenize, FreqDist; from nltk.stem import PorterStemmer
def get_features(self, document):
    document = re.sub('[%s]' % re.escape(string.punctuation), '', document)  # remove punctuation
    document = document.lower()  # make everything lowercase
    all_words = [w for w in word_tokenize(document) if 3 < len(w) < 16]
    p = PorterStemmer()
    all_words = [p.stem(w) for w in all_words]
    all_words_freq = FreqDist(all_words)
    # print(sorted(all_words_freq.items(), key=lambda wc: (-wc[1], wc[0])))
    return all_words_freq
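
# Hypothetical call, assuming the method lives on some `extractor` object:
# feats = extractor.get_features("Running runners run quickly; quick runs matter.")
# feats.most_common(2)  # roughly [('run', 2), ('runner', 1)] after stemming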
answer_1 = answer[0].strip()
if answer_1 == '':  # .strip() always returns a string, so only the empty-string check is needed
    answer_1 = answer[1].strip()
    print(True)
else:
    answer_1 = answer[0].strip()

passage_concat = ''
#for pi, p in enumerate(article["paragraphs"]):
for passage in json_line['passages']:
    passage_concat += passage['passage_text']
    #context = p["context"]
    #context = context.replace("''", '" ')
    #context = context.replace("``", '" ')

passage = word_tokenize(passage_concat)
passage_sent = sent_tokenize(passage_concat)
passage_sent = [word_tokenize(sent) for sent in passage_sent]
passages.append(passage)                     # word-level paragraph
passages_sent.append(passage_sent)           # sentence/word-level paragraph
passages_original.append(passage_concat)     # original paragraph
passages_original_sent.append(passage_sent)  # sentence-tokenized original paragraph
for w in passage:
    word_counter[w] += 1

#for qa in p["qas"]:
question = word_tokenize(json_line["query"])
answers = []
answers_sent = []
for w in question:
    word_counter[w] += 1
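
# word_counter and the passages* lists come from earlier in the original script;
# a plausible setup (an assumption) would be:
from collections import Counter

word_counter = Counter()
passages, passages_sent = [], []
passages_original, passages_original_sent = [], []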
def do_keras_lstm(text, stars):
    # convert the texts into bag-of-words sequences
    max_document_length = 200
    # remove common words
    text_cleaned = []
    list_stopWords = list(set(stopwords.words('english')))
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
    d = enchant.Dict("en_US")
    for line in text:
        # tokenize
        list_words = word_tokenize(line.lower())
        # strip punctuation
        list_words = [word for word in list_words if word not in english_punctuations]
        # drop words that are not common English words (WordNet variant kept for reference)
        #list_words = [word for word in list_words if wordnet.synsets(word)]
        list_words = [word for word in list_words if d.check(word)]
        # filter stopwords
        filtered_words = [w for w in list_words if w not in list_stopWords]
        text_cleaned.append(" ".join(filtered_words))
    text = text_cleaned
    # cap the vocabulary size, i.e. the number of words kept in the bag of words
    tokenizer = Tokenizer(num_words=max_features, lower=True)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
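
    # The excerpt stops after texts_to_sequences; a typical continuation inside the
    # function (a sketch that assumes max_features is a module-level constant and
    # `stars` holds binary labels, not the original code) pads to max_document_length
    # and trains a small Embedding+LSTM classifier:
    from keras.preprocessing.sequence import pad_sequences
    from keras.models import Sequential
    from keras.layers import Embedding, LSTM, Dense

    x = pad_sequences(sequences, maxlen=max_document_length)

    model = Sequential()
    model.add(Embedding(max_features, 128, input_length=max_document_length))
    model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(x, stars, batch_size=32, epochs=5, validation_split=0.1)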