for p in short_neg.split('\n'):
    documents.append((p, "neg"))
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())
# Pickling documents.
save_documents = open("documents.pickle","wb")
pickle.dump(documents, save_documents)
save_documents.close()
# Frequency Distribution
all_words = nltk.FreqDist(all_words)
# take the 5,000 most frequent words as features
word_features = [w for (w, _) in all_words.most_common(5000)]
save_word_features = open("word_features5k.pickle","wb")
pickle.dump(word_features, save_word_features)
save_word_features.close()
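# Hedged sketch (not in the original): reloading the pickled documents and word features
# in a later session, using the file names saved above.
documents_f = open("documents.pickle", "rb")
documents = pickle.load(documents_f)
documents_f.close()
word_features_f = open("word_features5k.pickle", "rb")
word_features = pickle.load(word_features_f)
word_features_f.close()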
# Adjusting the feature-finding function to tokenize each document by word.
def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features
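# Minimal usage sketch (not part of the original tutorial): build labelled feature sets
# from the documents list above. The 10,000-document split point is an arbitrary choice.
import random
featuresets = [(find_features(text), label) for (text, label) in documents]
random.shuffle(featuresets)
training_set = featuresets[:10000]
testing_set = featuresets[10000:]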
def make_model(stats_infile, picklefile, smoothingparam=0.001, min_freq=3, protocol=-1):
    """Train a probability model on a korp statistics file and save it as a pickle file.
    The model is a LidstoneProbDist (NLTK) which has tuples (wordform, MSD-tag) as keys
    and smoothed probabilities as values."""
    fdist = FreqDist()
    with open(stats_infile, encoding='utf-8') as f:
        for line in f:
            fields = line[:-1].split('\t')
            word = fields[0]
            # Stop at word forms that occur fewer times than min_freq
            # (the statistics file is assumed to be sorted by descending frequency)
            if int(fields[4]) < min_freq:
                break
            # Get rid of all urls
            if word.startswith("http://"):
                continue
            # # Words that only occur once may only contain letters and hyphens
            # if fields[4] == '1' and any(not (c.isalpha() or c == "-") for c in word):
            #     continue
            # if len(word) > 100:
            #     continue
            simple_msd = fields[1][:fields[1].find('.')] if '.' in fields[1] else fields[1]
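            # Hedged sketch (not in the original excerpt): accumulate counts of
            # (wordform, MSD-tag) pairs, as the docstring describes.
            fdist[(word, simple_msd)] += int(fields[4])

    # Hedged sketch of the remaining steps implied by the docstring: smooth the counts
    # with Lidstone and pickle the model. LidstoneProbDist lives in nltk.probability;
    # passing fdist.B() as the bins argument is an assumption.
    pd = LidstoneProbDist(fdist, smoothingparam, fdist.B())
    with open(picklefile, 'wb') as p:
        pickle.dump(pd, p, protocol=protocol)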
def create_features_map(cfg_filename, vocab_filename):
    start = time.perf_counter()
    configuration = Utils.parse_classification_configuration(cfg_filename)
    text_chunks, labels = Utils.divide_into_chunks(configuration)
    countries = [entry.name for entry in configuration]
    dictionary = {}
    words_list = Utils.load_words_list(vocab_filename)
    for country, chunk in zip(countries, text_chunks):
        dcountry = {}
        dist = FreqDist(chunk.split())
        for word in words_list:
            dcountry[word] = dist[word]
        # end for
        dictionary[country] = dcountry
    # end for
    with open('vocab.countries.pkl', 'wb') as fout:
        pickle.dump(dictionary, fout, pickle.HIGHEST_PROTOCOL)
    # end with
    print('time:', '{0:.3f}'.format(time.perf_counter() - start))
# end def
def display():
    import pylab
    # frequency distribution of all the words in the news category
    word_freqs = nltk.FreqDist(brown.words(categories='news')).most_common()
    # the words, ordered by descending frequency
    words_by_freq = [w for (w, _) in word_freqs]
    # a conditional frequency distribution mapping each word to the frequency of its tags
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    # model sizes: powers of two from 1 up to 2**14
    sizes = 2 ** pylab.arange(15)
    # for each size, evaluate a lookup tagger built from the `size` most frequent words,
    # i.e. plot how performance changes as the training model grows
    perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
    pylab.plot(sizes, perfs, '-bo')
    # label the plot
    pylab.title('Lookup Tagger Performance with Varying Model Size')
    pylab.xlabel('Model Size')
    pylab.ylabel('Performance')
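# The snippet above assumes a performance() helper. A sketch along the lines of the
# NLTK book's lookup-tagger example (the 'NN' backoff tag is an assumption):
def performance(cfd, wordlist):
    lt = dict((word, cfd[word].max()) for word in wordlist)
    baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
    return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))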
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features
def term_frequency(words):
    '''
    Turn a list of words into a NLTK frequency distribution object
    '''
    t1 = time.time()
    fdist = FreqDist(words)
    # remove stopwords here rather than in corpus text for speed
    # http://stackoverflow.com/questions/7154312/how-do-i-remove-entries-within-a-counter-object-with-a-loop-without-invoking-a-r
    stop = set(stopwords.words('english'))
    for w in list(fdist):
        if w in stop:
            del fdist[w]
    t2 = time.time()
    logging.debug(" create term freq: %.3fs" % (t2 - t1))
    return fdist
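# Minimal usage sketch (the sample sentence is made up; assumes NLTK's tokenizer data is installed):
from nltk.tokenize import word_tokenize
tokens = word_tokenize("The quick brown fox jumps over the lazy dog.")
fdist = term_frequency(tokens)
print(fdist.most_common(5))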
def __init__(self):
    self.uniDist = nltk.FreqDist()
    self.backwardBiDist = nltk.FreqDist()
    self.forwardBiDist = nltk.FreqDist()
    self.trigramDist = nltk.FreqDist()
    self.wordCasingLookup = {}
    self.title_case_unknown_tokens = True
def ch05_10_train_test_unigram_tagger():
    from nltk.corpus import brown
    fd = nltk.FreqDist(brown.words(categories="news"))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories="news"))
    # most frequent tag for each word seen in the news category
    most_freq_pos = dict((word, cfd[word].max()) for word in fd.keys())
    unigram_tagger = nltk.UnigramTagger(model=most_freq_pos)
    for sent in brown.sents(categories="editorial")[:10]:
        tagged = unigram_tagger.tag(sent)
        print(sent)
        print(">>>", tagged)
        print("not tagged: ", [(a, b) for (a, b) in tagged if b is None])
wordcloud3 = wc.generate_from_frequencies(words)
# from text document
import nltk
from nltk.corpus import webtext
from nltk.probability import FreqDist
from wordcloud import WordCloud
import matplotlib.pyplot as plt
#nltk.download("webtext")
import os
os.getcwd()
wt_words = webtext.words('E:/pywork/pyprojects/pyanalytics19/data/testing.txt') #sample data
data_analysis = nltk.FreqDist(wt_words)
data_analysis
filter_words = dict([(m,n) for m, n in data_analysis.items() if len(m) > 3])
filter_words
wcloud = WordCloud().generate_from_frequencies(filter_words)
plt.imshow(wcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
def ex12():
    from nltk.corpus import cmudict
    entries = cmudict.entries()
    words = [word for (word, pron) in entries]
    distinct_words = set(words)
    fd = nltk.FreqDist(words)
    # words that appear more than once in the entries have multiple pronunciations
    multi_prons = sum(1 for word in fd if fd[word] > 1)
    print("#-distinct words:", len(distinct_words))
    print("#-words with multiple prons:", multi_prons)