def lemmatizeDescription(description):
    descriptionWords = description.split()
    newDesc = ''
    mapListTuples = pos_tag(descriptionWords)
    for m in mapListTuples:
        try:
            word, typ = m
            newDesc = newDesc + wordNetLem.lemmatize(str(word), get_wordnet_pos(str(typ)))
            newDesc = newDesc + ' '
            #print(newDesc)
            #print(str(word), str(typ))
        except Exception as e:
            print(e)
    return newDesc
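The snippet assumes a module-level wordNetLem lemmatizer and a get_wordnet_pos helper that maps Penn Treebank tags to the WordNet POS constants lemmatize() expects. A minimal sketch of those pieces (names taken from the call sites above; the mapping logic is an assumption, using the common Treebank-to-WordNet recipe):

from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Assumed module-level lemmatizer used by lemmatizeDescription above.
wordNetLem = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    # Map a Penn Treebank tag (e.g. 'VBD', 'JJ') to a WordNet POS constant.
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN  # default, matching lemmatize()'s own default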
def getSimpleQuestionKeywords(query):
    browntext = brown.words()
    browndist = nltk.FreqDist(browntext)
    reuterstext = reuters.words()
    reutersdist = nltk.FreqDist(reuterstext)
    text = nltk.word_tokenize(query)
    tagged = nltk.pos_tag(text)
    filteredparts = []
    for pair in tagged:
        if pair[1] in ['FW', 'JJ', 'JJR', 'JJS', 'JJT', 'N', 'NN', 'NNP', 'NNS', 'NP', 'NPS', 'NR', 'RB', 'RBR', 'RBT', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'NUM', 'CD', 'OD']:
            filteredparts.append(pair[0])
    filtereddist = {}
    for word in filteredparts:
        # Keep only words that are relatively rare in the Brown and Reuters corpora
        frequency = browndist[word] + reutersdist[word]
        if frequency < 600:
            filtereddist[word] = frequency
    sortedlist = sorted(filtereddist.items(), key=itemgetter(1))
    keywords = []
    for pair in sortedlist:
        keywords.append(pair[0])
    return keywords
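A sketch of the imports and setup the function assumes (the Brown and Reuters corpora plus the tokenizer and tagger models must be downloaded once via nltk.download), followed by an example call:

import nltk
from operator import itemgetter
from nltk.corpus import brown, reuters

# One-time downloads assumed by the function above:
# nltk.download('brown'); nltk.download('reuters')
# nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')

print(getSimpleQuestionKeywords("Who painted the ceiling of the Sistine Chapel?"))
# Rare content words such as 'Sistine', 'Chapel', 'ceiling' should sort to the
# front of the result; the exact output depends on the corpus counts.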
def extractNames(li):
    finList = []
    ## Loop through the list that has the HTML page content
    for a in li:
        ## Tokenize the HTML text into smaller blocks of text
        for send in nltk.sent_tokenize(str(a)):
            smLi = []
            ## Tokenize the smaller blocks of text into individual words and add a Part-of-Speech (POS) tag
            for index, chunk in enumerate(nltk.pos_tag(nltk.word_tokenize(send))):
                ## If the POS tag is NNP (proper noun)
                if 'NNP' in chunk[1]:
                    ## If the word, with non-alphanumeric characters stripped, is longer than 2 characters
                    if len(' '.join(e for e in chunk[0] if e.isalnum())) > 2:
                        ## Append the word's index, the (word, POS tag) chunk and the link
                        smLi.append([index, chunk, a[1]])
            finList.append(smLi)
    nameLi = []
    for f in finList:
        if len(f) > 0:
            strName = ''
            for index, i in enumerate(f):
                ## If strName is blank, initialize it with the current word in the list
                if strName == '':
                    strName = i[1][0]
                ## If index+1 is not at the end of the list, continue
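The snippet above is cut off mid-loop in the source. As a rough illustration of the same idea (stitching adjacent NNP-tagged tokens into multi-word names), a self-contained sketch using only nltk might look like the following; the function name and logic here are hypothetical, not the original code:

import nltk

def extract_proper_noun_phrases(text):
    # Collect runs of adjacent NNP-tagged tokens as candidate names.
    names = []
    for sent in nltk.sent_tokenize(text):
        tagged = nltk.pos_tag(nltk.word_tokenize(sent))
        current = []
        for word, tag in tagged:
            if tag.startswith('NNP'):
                current.append(word)
            else:
                if current:
                    names.append(' '.join(current))
                current = []
        if current:
            names.append(' '.join(current))
    return names

# extract_proper_noun_phrases("Barack Obama met Angela Merkel in Berlin.")
# typically returns ['Barack Obama', 'Angela Merkel', 'Berlin']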
def tags(text):
    """Return the text's tokens with part-of-speech tags.
    """
    words = nltk.word_tokenize(text)
    return nltk.pos_tag(words)
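An example call, assuming the punkt tokenizer and perceptron tagger data are installed:

import nltk

print(tags("I love Berlin"))
# roughly: [('I', 'PRP'), ('love', 'VBP'), ('Berlin', 'NNP')] with the default tagger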
def process_line(s, clean_string=True):
    if clean_string:
        s = clean_str(s)
    tokens = tokenize(s)
    #return [process_token(None,token).lower() for token in tokens]
    sent = nltk.pos_tag(tokens)
    chunks = nltk.ne_chunk(sent, binary=False)
    # Python 2 idiom: map(None, ...) zips the sequences, padding the shorter
    # one with None (the chunk tree and the token list may differ in length).
    return [process_token(c, token).lower().encode('UTF-8') for c, token in map(None, chunks, tokens)]
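map(None, chunks, tokens) only works on Python 2; on Python 3 the closest equivalent is itertools.zip_longest. A minimal standalone illustration of the equivalence:

from itertools import zip_longest

chunks = ['a', 'b']
tokens = ['a', 'b', 'c']
# Python 2: map(None, chunks, tokens) -> [('a', 'a'), ('b', 'b'), (None, 'c')]
print(list(zip_longest(chunks, tokens)))
# [('a', 'a'), ('b', 'b'), (None, 'c')]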
def get_first_last_tuples(sentence):
    first_last_tuples = []
    sentenceParsed = word_tokenize(sentence)
    pos = nltk.pos_tag(sentenceParsed)  # Parts Of Speech
    pos = [i[1] for i in pos]  # extract the 2nd element of the POS tuples in list
    n = len(pos)
    first = ""
    last = ""
    if n > 1:  # need at least two items
        first = "-".join(pos[0:2])  # pull out first 2 list items
        last = "-".join(pos[-2:])  # pull out last 2 list items
    first_last_tuples = [first, last]
    return first_last_tuples
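An example of what the function returns; the tags shown are what NLTK's default tagger typically produces, and the imports below are assumed:

import nltk
from nltk import word_tokenize

print(get_first_last_tuples("How are you today"))
# typically ['WRB-VBP', 'PRP-NN'], i.e. the POS bigrams at the start and end
# of the sentence; the exact tags depend on the tagger model.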
def pos_tag(docs):
    """
    Example:
    For city entity
    docs = [[('Book', 'O'), ('a', 'O'), ('flight', 'O'), ('to', 'O'), ('New', 'B'), ('York', 'I')],
            [('I', 'O'), ('want', 'O'), ('to', 'O'), ('fly', 'O'), ('to', 'O'), ('California', 'B')]]
    pos_tag(docs)
    >> [[('Book', 'NNP', 'O'), ('a', 'DT', 'O'), ('flight', 'NN', 'O'), ('to', 'TO', 'O'),
        ('New', 'NNP', 'B'), ('York', 'NNP', 'I')], [('I', 'PRP', 'O'), ('want', 'VBP', 'O'),
        ('to', 'TO', 'O'), ('fly', 'RB', 'O'), ('to', 'TO', 'O'), ('California', 'NNP', 'B')]]
    """
    data = []
    for i, doc in enumerate(docs):
        # Obtain the list of tokens in the document
        tokens = [t for t, label in doc]
        # Perform POS tagging
        tagged = nltk.pos_tag(tokens)
        # Take the word, POS tag, and its label
        data.append([(w, pos, label) for (w, label), (word, pos) in zip(doc, tagged)])
    return data
filtered_words = [x[0] for x in pos if x[1] in ('NN', 'JJ')]
print filtered_words
print FreqDist(filtered_words).items()[:20]
print strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())
# Compare the most frequent words in both sets, while ignoring stopwords
print ''
for category in movie_reviews.categories():
    print 'Category', category
    # I used a cut off here to speed up the process
    # Remove [:1000] to analyze all reviews, but it will take ~1h to process
    all_words = movie_reviews.words(categories=category)[:1000]
    pos = nltk.pos_tag(all_words)
    all_filtered_words = [x[0] for x in pos if x[1] in ('NN', 'NNS', 'JJ') and len(x[0]) > 1]
    all_words_by_frequency = FreqDist(all_filtered_words)
    print all_words_by_frequency.items()[:20]
    print strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())
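This snippet is Python 2. On Python 3 with a current NLTK, print becomes a function and the frequency slice is usually written with FreqDist.most_common; a sketch of the equivalent loop:

import nltk
from time import gmtime, strftime
from nltk import FreqDist
from nltk.corpus import movie_reviews

for category in movie_reviews.categories():
    print('Category', category)
    # Same cut-off as above to keep the run short
    all_words = movie_reviews.words(categories=category)[:1000]
    tagged = nltk.pos_tag(all_words)
    filtered = [w for w, t in tagged if t in ('NN', 'NNS', 'JJ') and len(w) > 1]
    print(FreqDist(filtered).most_common(20))
    print(strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()))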
def pattern_verb_dt_adj_noun(self, sent, jsondata):
    """
    Aditya : Takes the sentence and finds the chunk (matching the regex)
    input : sentence
    output : chunked short sentence
    """
    words = nltk.word_tokenize(sent)
    tagged = nltk.pos_tag(words)
    verbs = self.catch_pos(['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'], tagged)
    nouns = self.catch_pos(['NN', 'NNP', 'NNS', 'NNPS'], tagged)
    # Chunk grammar: one or more verbs, an optional determiner, an optional
    # adjective, then one or more nouns
    chunkGram = 'Chunk: {<VB.?>+<DT>?<JJ>?<NN.?>+}'
    chunkParser = nltk.RegexpParser(chunkGram)
    chunked = chunkParser.parse(tagged)
    # print("Here we are")
    print(chunked)
    # chunked.draw()
    # chunked = nltk.ne_chunk(tagged)
    chunk = self.tree_to_dict(chunked)
    pattern_strings = []
    if len(chunk) != 0:
        for chunk_no in range(len(chunk)):
            pattern_string = chunk["Chunk" + str(chunk_no + 1)]
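The snippet ends mid-loop and relies on catch_pos and tree_to_dict helpers from the same class. A rough sketch of what those helpers plausibly do, inferred from the call sites above (hypothetical implementations, not the original code):

import nltk

def catch_pos(wanted_tags, tagged):
    # Return the words whose POS tag is in wanted_tags.
    return [word for word, tag in tagged if tag in wanted_tags]

def tree_to_dict(tree):
    # Map each 'Chunk' subtree to a key like 'Chunk1', 'Chunk2', ... with the
    # chunk's words joined back into a short phrase.
    chunks = {}
    count = 0
    for subtree in tree.subtrees(lambda t: t.label() == 'Chunk'):
        count += 1
        chunks['Chunk' + str(count)] = ' '.join(word for word, tag in subtree.leaves())
    return chunks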