How to use the nltk.pos_tag function in nltk

To help you get started, we’ve selected a few nltk examples, based on popular ways it is used in public projects.

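In all of these examples the pattern is the same: nltk.pos_tag takes a list of tokens and returns (token, tag) pairs in the Penn Treebank tagset. A minimal sketch, assuming the punkt and averaged_perceptron_tagger resources have been downloaded (output tags are illustrative):

import nltk

# One-time setup:
# nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')
tokens = nltk.word_tokenize("NLTK tags parts of speech.")
print(nltk.pos_tag(tokens))
# e.g. [('NLTK', 'NNP'), ('tags', 'VBZ'), ('parts', 'NNS'),
#       ('of', 'IN'), ('speech', 'NN'), ('.', '.')]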

github ParasAvkirkar / -Competitive-Coding-Problem-Classifier-and-Recommender / Data Transformation / codeforces_problem / lemmatization.py
def lemmatizeDescription(description):
    descriptionWords = description.split()
    newDesc = ''
    mapListTuples = pos_tag(descriptionWords)
    for word, typ in mapListTuples:
        try:
            # Lemmatize the word using the WordNet POS derived from its Treebank tag
            newDesc = newDesc + wordNetLem.lemmatize(str(word), get_wordnet_pos(str(typ))) + ' '
        except Exception as e:
            print(e)
    return newDesc
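The helper relies on wordNetLem and get_wordnet_pos, which are defined elsewhere in lemmatization.py. A common shape for that mapping, sketched here as an assumption rather than the repo's exact code:

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# nltk.download('wordnet')
wordNetLem = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    # Map a Penn Treebank tag prefix to the WordNet POS constant
    # that WordNetLemmatizer.lemmatize() expects
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN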

github christopherklee / trivial_pursuit / trivialpursuitfunctions.py
def getSimpleQuestionKeywords(query):
    browntext = brown.words()
    browndist = nltk.FreqDist(browntext)

    reuterstext = reuters.words()
    reutersdist = nltk.FreqDist(reuterstext)

    text = nltk.word_tokenize(query)
    tagged = nltk.pos_tag(text)

    filteredparts = []
    for pair in tagged:
        if pair[1] in ['FW', 'JJ', 'JJR', 'JJS', 'JJT', 'N', 'NN', 'NNP', 'NNS', 'NP', 'NPS', 'NR', 'RB', 'RBR', 'RBT', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'NUM', 'CD', 'OD']:
            filteredparts.append(pair[0])

    filtereddist = {}
    for word in filteredparts:
        frequency = browndist[word] + reutersdist[word]
        if frequency < 600:
            filtereddist[word] = frequency
    sortedlist = sorted(filtereddist.items(), key=itemgetter(1))
    keywords = []
    for pair in sortedlist:
        keywords.append(pair[0])
    return keywords
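The function assumes operator.itemgetter and the Brown and Reuters corpora are imported earlier in the file. A hypothetical call (the query text is made up) might look like:

from operator import itemgetter
import nltk
from nltk.corpus import brown, reuters

# One-time downloads:
# nltk.download('brown'); nltk.download('reuters'); nltk.download('punkt')
print(getSimpleQuestionKeywords("Who painted the Mona Lisa?"))
# Keywords come back rarest-first: words seen fewer than 600 times
# across Brown and Reuters, sorted by combined frequency.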

github Test-BMOHB / Media-Monitoring / pyScrape_GoogleNewsPanama.py
def extractNames(li):
    finList = []
##  Loop through the list that has the HTML page content
    for a in li:
##  Split the HTML text into sentences
        for send in nltk.sent_tokenize(str(a)):
            smLi = []
##  Tokenize each sentence into words and add a Part-of-Speech (POS) tag
            for index, chunk in enumerate(nltk.pos_tag(nltk.word_tokenize(send))):
##  If the POS tag contains NNP (proper noun)
                if 'NNP' in chunk[1]:
##  Keep the word only if it contains at least two alphanumeric characters
                    if len(' '.join(e for e in chunk[0] if e.isalnum())) > 2:
##  Append the index of the word, the (word, tag) chunk, and the link
                        smLi.append([index, chunk, a[1]])
            finList.append(smLi)
    nameLi = []
    for f in finList:
        if len(f) > 0:
            strName = ''
            for index, i in enumerate(f):
##  If strName is blank, declare it with the current word in the list
                if strName == '':
                    strName = i[1][0]
##  If index+1 is not at the end of the list, continue
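The snippet is truncated here; the remainder of the loop evidently joins consecutive proper nouns into multi-word names. A generic sketch of that technique, not the repo's code:

def group_proper_nouns(tagged):
    # Collect runs of consecutive NNP-tagged tokens as candidate names
    names, current = [], []
    for word, tag in tagged:
        if tag.startswith('NNP'):
            current.append(word)
        elif current:
            names.append(' '.join(current))
            current = []
    if current:
        names.append(' '.join(current))
    return names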

github code4lib / shortimer / miner.py
def tags(text):
    """returns some text with part of speech tagging
    """
    words = nltk.word_tokenize(text)
    return nltk.pos_tag(words)
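A quick usage check (output tags depend on the tagger model, so treat them as illustrative):

print(tags("Short sentences tag quickly."))
# e.g. [('Short', 'JJ'), ('sentences', 'NNS'), ('tag', 'VBP'),
#       ('quickly', 'RB'), ('.', '.')]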

github ryan-lowe / Ubuntu-Dialogue-Generationv2 / createDictionaries.py
from itertools import zip_longest

def process_line(s, clean_string=True):
    if clean_string:
        s = clean_str(s)
    tokens = tokenize(s)
    sent = nltk.pos_tag(tokens)
    chunks = nltk.ne_chunk(sent, binary=False)
    # Python 2's map(None, a, b) zips with None-padding; zip_longest is the
    # Python 3 equivalent
    return [process_token(c, token).lower().encode('UTF-8')
            for c, token in zip_longest(chunks, tokens)]
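nltk.ne_chunk returns an nltk.Tree in which named entities appear as labeled subtrees (PERSON, GPE, and so on). Independent of this repo's helpers, a minimal sketch for pulling entities out of such a tree:

import nltk

def named_entities(text):
    # Requires the punkt, averaged_perceptron_tagger, maxent_ne_chunker
    # and words resources to be downloaded
    tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)), binary=False)
    # Children of the tree are either (word, tag) tuples or entity subtrees
    return [(sub.label(), ' '.join(w for w, t in sub.leaves()))
            for sub in tree if isinstance(sub, nltk.Tree)]

print(named_entities("Barack Obama visited Paris."))
# e.g. [('PERSON', 'Barack Obama'), ('GPE', 'Paris')]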

github vishakha-lall / MapBot / MapBotFacebook / features.py
def get_first_last_tuples(sentence):
    first_last_tuples = []
    sentenceParsed = word_tokenize(sentence)
    pos = nltk.pos_tag(sentenceParsed) #Parts Of Speech
    pos = [ i[1] for i in pos ]  # extract the 2nd element of the POS tuples in list

    n = len(pos)
    first = ""
    last = ""

    if n > 1:  # need at least two POS tags
        first = "-".join(pos[0:2]) # pull out first 2 list items
        last = "-".join(pos[-2:]) # pull out last 2 list items

    first_last_tuples = [first, last]
    return first_last_tuples
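For instance (exact tags vary with the tagger; word_tokenize is imported earlier in the file):

print(get_first_last_tuples("Where is the nearest station?"))
# e.g. ['WRB-VBZ', 'NN-.']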

github hellohaptik / chatbot_ner / models / crf_v2 / Crf.py
        Example:
            For city entity
            docs = [[('Book', 'O'),  ('a', 'O'),  ('flight', 'O'),  ('to', 'O'),  ('New', 'B'),  ('York', 'I')],
                [('I', 'O'),  ('want', 'O'),  ('to', 'O'),  ('fly', 'O'),  ('to', 'O'),  ('California', 'B')]]
            pos_tag(docs)

            >> [[('Book', 'NNP', 'O'),  ('a', 'DT', 'O'),  ('flight', 'NN', 'O'),  ('to', 'TO', 'O'),
              ('New', 'NNP', 'B'),  ('York', 'NNP', 'I')], [('I', 'PRP', 'O'),  ('want', 'VBP', 'O'),
                ('to', 'TO', 'O'),  ('fly', 'RB', 'O'),  ('to', 'TO', 'O'),  ('California', 'NNP', 'B')]]
        """
        data = []
        for doc in docs:
            # Obtain the list of tokens in the document
            tokens = [t for t, label in doc]
            # Perform POS tagging
            tagged = nltk.pos_tag(tokens)
            # Combine each word, its POS tag, and its label
            data.append([(w, pos, label) for (w, label), (_, pos) in zip(doc, tagged)])
        return data

github zelandiya / KiwiPyCon-NLP-tutorial / basics / 04_filtering_by_pos.py
filtered_words = [x[0] for x in pos if x[1] in ('NN', 'JJ')]

print(filtered_words)
print(FreqDist(filtered_words).most_common(20))

print(strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()))

# Compare the most frequent words in both sets, while ignoring stopwords
print('')
for category in movie_reviews.categories():

    print('Category', category)
    # I used a cut off here to speed up the process
    # Remove [:1000] to analyze all reviews, but it will take ~1h to process
    all_words = movie_reviews.words(categories=category)[:1000]
    pos = nltk.pos_tag(all_words)
    all_filtered_words = [x[0] for x in pos if x[1] in ('NN', 'NNS', 'JJ') and len(x[0]) > 1]

    all_words_by_frequency = FreqDist(all_filtered_words)
    print(all_words_by_frequency.most_common(20))

print(strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()))

github adityasarvaiya / Automatic_Question_Generation / aqg / utils / actual_question_formation.py
    def pattern_verb_dt_adj_noun(self, sent, jsondata):
        """
        Aditya : Takes the sentence and finds the chunk (matches the regex)
        input : sentence
        output : chunked short sentence
        """
        words = nltk.word_tokenize(sent)
        tagged = nltk.pos_tag(words)
        verbs = self.catch_pos(['VB','VBD','VBG','VBN','VBP','VBZ'],tagged)
        nouns = self.catch_pos(['NN','NNP','NNS','NNPS'], tagged)

        # POS-tag pattern reconstructed from the function name (the tags were
        # garbled in the source page): one or more verbs, an optional
        # determiner, an optional adjective, one or more nouns
        chunkGram = r'Chunk: {<VB.*>+<DT>?<JJ.*>?<NN.*>+}'
        chunkParser = nltk.RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)
        # print("Here we are")
        print(chunked)
        # chunked.draw()
        # chunked = nltk.ne_chunk(tagged)
        chunk = self.tree_to_dict(chunked)
        pattern_strings =[]
        if len(chunk) != 0:
            for chunk_no in range(len(chunk)):

                pattern_string = chunk["Chunk"+str(chunk_no+1)]
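For reference, RegexpParser grammars like the one reconstructed above group matching tag sequences into labeled subtrees. A standalone sketch, outside the class (the sentence and printed chunk are illustrative):

import nltk

grammar = r'Chunk: {<VB.*>+<DT>?<JJ.*>?<NN.*>+}'
tagged = nltk.pos_tag(nltk.word_tokenize("She wrote a short story yesterday."))
tree = nltk.RegexpParser(grammar).parse(tagged)
# Walk only the subtrees the grammar labeled 'Chunk'
for subtree in tree.subtrees(filter=lambda t: t.label() == 'Chunk'):
    print(' '.join(word for word, tag in subtree.leaves()))
# e.g. "wrote a short story"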