def reply(self, paragraph, question):
    input_paragraph_seq = []
    input_question_seq = []
    input_paragraph_emb = []
    input_question_emb = []
    input_paragraph_text = paragraph.lower()
    input_question_text = question.lower()
    # Encode the whitelisted paragraph tokens with the GloVe model.
    for word in nltk.word_tokenize(input_paragraph_text):
        if not in_white_list(word):
            continue
        emb = self.glove_model.encode_word(word)
        input_paragraph_emb.append(emb)
    # Encode the whitelisted question tokens the same way.
    for word in nltk.word_tokenize(input_question_text):
        if not in_white_list(word):
            continue
        emb = self.glove_model.encode_word(word)
        input_question_emb.append(emb)
    input_paragraph_seq.append(input_paragraph_emb)
    input_question_seq.append(input_question_emb)
    input_paragraph_seq = pad_sequences(input_paragraph_seq, self.max_encoder_paragraph_seq_length)
    input_question_seq = pad_sequences(input_question_seq, self.max_encoder_question_seq_length)
    # Run the encoder to get the initial decoder states, then seed the decoder with START.
    states_value = self.encoder_model.predict([input_paragraph_seq, input_question_seq])
    target_seq = np.zeros((1, 1, self.num_decoder_tokens))
    target_seq[0, 0, self.target_word2idx['START']] = 1
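    # The excerpt ends here, right after seeding the decoder with the START token.
    # A typical Keras encoder-decoder inference loop that would follow is sketched
    # below; self.decoder_model, self.target_idx2word and self.max_decoder_seq_length
    # are assumed attribute names, not taken from this excerpt.
    decoded_words = []
    terminated = False
    while not terminated:
        output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value)
        sample_token_idx = np.argmax(output_tokens[0, -1, :])
        sample_word = self.target_idx2word[sample_token_idx]
        if sample_word not in ('START', 'END'):
            decoded_words.append(sample_word)
        if sample_word == 'END' or len(decoded_words) >= self.max_decoder_seq_length:
            terminated = True
        # Feed the sampled token back in and carry the decoder states forward.
        target_seq = np.zeros((1, 1, self.num_decoder_tokens))
        target_seq[0, 0, sample_token_idx] = 1
        states_value = [h, c]
    return ' '.join(decoded_words)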
def get_speaker_info_from_transcript(proscript, scriptfile):
    script_data_list, script_speaker_data, script_data = read_movie_transcript(scriptfile)
    index_segment = 0
    index_script = 0
    last_matched_script_index = 0
    while index_segment < proscript.get_no_of_segments() and index_script < len(script_data_list):
        curr_seg = proscript.segment_list[index_segment]
        entry_segment_list = nltk.word_tokenize(curr_seg.transcript.translate(PUNCTUATION_TRANS).lower())
        entry_script_list = script_data_list[index_script]
        #print("seg:%s"%entry_segment_list)
        #print("scr:%s"%entry_script_list)
        intersecting = get_list_intersection(entry_segment_list, entry_script_list)
        no_of_intersecting = len(intersecting)
        #print("%i/%i intersects"%(no_of_intersecting, len(entry_segment_list)))
        meh = False
        if no_of_intersecting >= len(entry_segment_list) * SCRIPT_MATCH_THRESHOLD:
            #print("match")
            #print("seg(%i):%s\nscr(%i):%s"%(index_segment, curr_seg.transcript, index_script, script_data[index_script]))
            curr_seg.speaker_id = script_speaker_data[index_script]
            remove_list_from_list(entry_script_list, intersecting)
            script_data_list[index_script] = entry_script_list
            index_segment += 1
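The matching rule in the loop above assigns a speaker to a segment once the token overlap with a script line reaches SCRIPT_MATCH_THRESHOLD of the segment's length. A minimal standalone illustration of that criterion (the helper and the threshold value below are hypothetical stand-ins, not the module's own definitions):

SCRIPT_MATCH_THRESHOLD = 0.7  # assumed value, for illustration only

def token_overlap(segment_tokens, script_tokens):
    # naive stand-in for get_list_intersection
    return [tok for tok in segment_tokens if tok in script_tokens]

segment = ["i", "will", "be", "back"]
script_line = ["i", "will", "be", "back", "soon"]
overlap = token_overlap(segment, script_line)
print(len(overlap) >= len(segment) * SCRIPT_MATCH_THRESHOLD)  # True: all 4 segment tokens intersect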
def shorttext_to_embedvec(self, shorttext):
    """ Convert the short text into an averaged embedded vector representation.

    Given a short sentence, it converts all the tokens into embedded vectors according to
    the given word-embedding model, sums them up, and normalizes the resulting vector.
    It returns the resulting vector that represents this short sentence.

    :param shorttext: a short sentence
    :return: an embedded vector that represents the short sentence
    :type shorttext: str
    :rtype: numpy.ndarray
    """
    vec = np.zeros(self.vecsize)
    tokens = word_tokenize(shorttext)
    for token in tokens:
        if token in self.wvmodel:
            vec += self.wvmodel[token]
    vec /= np.linalg.norm(vec)
    return vec
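One caveat with the method above: if none of the tokens are found in the word-embedding model, vec stays all zeros and the final division produces NaNs. A standalone variant with a zero-norm guard might look like this (a sketch assuming a gensim KeyedVectors-style model):

import numpy as np
from nltk import word_tokenize

def averaged_embedding(shorttext, wvmodel, vecsize):
    # Sum the vectors of all in-vocabulary tokens, then L2-normalize.
    vec = np.zeros(vecsize)
    for token in word_tokenize(shorttext):
        if token in wvmodel:
            vec += wvmodel[token]
    norm = np.linalg.norm(vec)
    return vec / norm if norm > 0.0 else vec  # avoid NaNs when nothing matched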
def doc_ir(data=list(), edocs=edict(), best=5, model=None):
    """
    Returns a dictionary of n best document titles for each claim.
    """
    rdocs = dict()
    for example in tqdm(data):
        claim = example["claim"]
        titles = find_titles_in_claim(claim, edocs)
        ctoks = word_tokenize(claim.lower())
        rdocs[example["id"]] = (titles, ctoks)
    t2tf = titles_to_tf()
    doctf = load_doc_tf(rdocs, t2tf)
    docs = dict()
    for example in tqdm(data):
        titles, ctoks = rdocs[example["id"]]
        tscores = best_titles(example["claim"], ctoks, titles, doctf, best, model)
        docs[example["id"]] = tscores
    return docs
def process(self, text: str) -> DocBase:
    tokens = nltk.word_tokenize(text)
    return ArrayDoc(text, tokens)
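Every snippet on this page relies on nltk.word_tokenize, which needs the Punkt tokenizer data to be installed; without it NLTK raises a LookupError. A one-time download fixes this (newer NLTK releases may additionally ask for 'punkt_tab'):

import nltk
nltk.download('punkt')        # tokenizer models used by nltk.word_tokenize
# nltk.download('punkt_tab')  # uncomment on recent NLTK versions that request it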
# Note that the gutenberg fileids only cover a small subset of the texts
# available on Project Gutenberg.
# If you wish to process a text from Project Gutenberg accessed via the web,
# you can use the urllib module to fetch it over the internet.
from urllib.request import urlopen
# This URL corresponds to "The Picture of Dorian Gray" by Oscar Wilde.
url = "https://www.gutenberg.org/cache/epub/174/pg174.txt"
raw = urlopen(url).read().decode('utf-8')
# Once the raw content has been extracted, we convert this content to something
# that NLTK can understand and process. This should look somewhat familiar if
# you have consulted Part 1 of this tutorial.
dorian_grey = nltk.Text(nltk.word_tokenize(raw))
# Once the text has been converted to an NLTK Text object, we can process it
# just like we have been doing previously. For example, here we convert the
# text object to a frequency distribution and calculate the hapaxes.
fdist_dorian = nltk.FreqDist(dorian_grey)
print(fdist_dorian.hapaxes())
# The above approach is not limited to text from Project Gutenberg, but is
# broadly applicable to any text that can be obtained from a direct URL.
# Let us consider another text resource that NLTK allows us to process: its
# collections of web and chat data. The first of these we shall focus on is
# the web text corpus.
# We can print out the file ids of the webtext collection to see what is provided:
for file_id in nltk.corpus.webtext.fileids():
    print(file_id)
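The same frequency-distribution workflow can be reused on any of these corpora. For example (the fileid is one commonly shipped with the webtext corpus; check the printed ids against your local NLTK data):

# Requires the 'webtext' corpus data: nltk.download('webtext')
firefox_text = nltk.Text(nltk.corpus.webtext.words('firefox.txt'))
fdist_firefox = nltk.FreqDist(firefox_text)
print(fdist_firefox.most_common(10))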
# Try to pattern match to one of the known queries
(corpus_index, similarity) = wd.find_corpus(question, Q_corpora)
if similarity < .25:
    # Unable to match to one of the templates
    results_dict["corpus_index"] = None
    results_dict["terms"] = None
    results_dict["error_code"] = "not_understood"
    results_dict["error_message"] = "Sorry, I was unable to interpret your question. The nearest similar question I can answer is:\n %s" % \
        Q_corpora[corpus_index][wd.max_in_corpus(question, Q_corpora[corpus_index])[0]]
    return results_dict

# get every contiguous sub-block in the query
blocks = []
question_tokenized = nltk.word_tokenize(question, "english")
for block_size in range(1, len(question_tokenized)):
    for i in range(len(question_tokenized) - block_size + 1):
        block = " ".join(question_tokenized[i:(i + block_size)])
        blocks.append(block)
blocks = list(reversed(blocks))  # go bigger to smaller since "is_assoc_with" \subst "gene_assoc_with" after stopword deletion
# for each block, look for the associated terms in a greedy fashion

#######################################################################
# Q3: What are the protein targets of naproxen?
#######################################################################
if corpus_index == 3:  # Q3
    # Greedy look for drug name TODO: in the future, may need to disambiguate terms like I did for other Q's
    # with candidate_node_names
    source_name = None
    target_label = None
    relationship_type = None
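To make the sub-block enumeration above concrete, here is what blocks ends up containing for a short question (a standalone rerun of the loop from the snippet; note that range(1, len(...)) never emits the full-length block):

import nltk

question_tokenized = nltk.word_tokenize("what are protein targets", "english")
blocks = []
for block_size in range(1, len(question_tokenized)):
    for i in range(len(question_tokenized) - block_size + 1):
        blocks.append(" ".join(question_tokenized[i:(i + block_size)]))
blocks = list(reversed(blocks))
print(blocks)
# ['are protein targets', 'what are protein', 'protein targets', 'are protein',
#  'what are', 'targets', 'protein', 'are', 'what']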
def reply(self, paragraph, question):
    input_paragraph_seq = []
    input_question_seq = []
    input_paragraph_wid_list = []
    input_question_wid_list = []
    input_paragraph_text = paragraph.lower()
    input_question_text = question.lower()
    for word in nltk.word_tokenize(input_paragraph_text):
        if not text_utils.in_white_list(word):
            continue
        idx = 1  # default [UNK]
        if word in self.input_paragraph_word2idx:
            idx = self.input_paragraph_word2idx[word]
        input_paragraph_wid_list.append(idx)
    for word in nltk.word_tokenize(input_question_text):
        if not text_utils.in_white_list(word):
            continue
        idx = 1  # default [UNK]
        if word in self.input_question_word2idx:
            idx = self.input_question_word2idx[word]
        input_question_wid_list.append(idx)
    input_paragraph_seq.append(input_paragraph_wid_list)
    input_question_seq.append(input_question_wid_list)
    input_paragraph_seq = pad_sequences(input_paragraph_seq, self.max_encoder_paragraph_seq_length)
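For reference, pad_sequences pads each word-id list to the requested length with zeros, prepended by default, so short inputs line up with the encoder length. A quick illustration (assuming the Keras implementation of pad_sequences):

from tensorflow.keras.preprocessing.sequence import pad_sequences
print(pad_sequences([[5, 8, 3]], maxlen=6))
# [[0 0 0 5 8 3]]  -- zeros are added on the left ('pre' padding) by default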
def candidate_entities_inverted_index(self, entity: str) -> List[Tuple[str]]:
    word_tokens = nltk.word_tokenize(entity)
    candidate_entities = []
    for tok in word_tokens:
        if len(tok) > 1:
            found = False
            if tok in self.inverted_index:
                candidate_entities += self.inverted_index[tok]
                found = True
            morph_parse_tok = self.morph.parse(tok)[0]
            lemmatized_tok = morph_parse_tok.normal_form
            if lemmatized_tok != tok and lemmatized_tok in self.inverted_index:
                candidate_entities += self.inverted_index[lemmatized_tok]
                found = True
            if not found:
                words_with_levens_1 = self.searcher.search(tok, d=1)
                for word in words_with_levens_1:
def getTokenisedScentence(self, inSentence):
    return nltk.pos_tag(nltk.word_tokenize(inSentence))
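A quick usage sketch for the helper above. nltk.pos_tag additionally needs the averaged perceptron tagger data (newer NLTK versions may ask for 'averaged_perceptron_tagger_eng' instead); the class holding the method is not shown in the snippet, so the instance below is hypothetical:

import nltk
nltk.download('averaged_perceptron_tagger')  # model used by nltk.pos_tag

# tagger = SomeTokenizerClass()  # whatever class defines getTokenisedScentence
# print(tagger.getTokenisedScentence("NLTK tags words quickly."))
# Output is a list of (token, POS-tag) pairs, e.g. [('NLTK', 'NNP'), ('tags', 'VBZ'), ...]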