def __le__(self, other):
if not isinstance(other, FreqDist):
raise_unorderable_types("<=", self, other)
return set(self).issubset(other) and all(
self[key] <= other[key] for key in self
)
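A minimal usage sketch of the ordering this method provides (data invented for illustration): one FreqDist is <= another when every sample in it occurs at least as often in the other.
from nltk.probability import FreqDist
small = FreqDist("aab")     # {'a': 2, 'b': 1}
large = FreqDist("aaabbc")  # {'a': 3, 'b': 2, 'c': 1}
print(small <= large)  # True: every count in small is covered by large
print(large <= small)  # False: 'c' does not occur in small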
try:
    parsetree_features["average_VP_length"] = 1.0 * lenVP / totVP
except ZeroDivisionError:  # no VPs found
    parsetree_features["average_VP_length"] = 0
try:
    parsetree_features["average_NP_length"] = 1.0 * lenNP / totNP
except ZeroDivisionError:  # no NPs found
    parsetree_features["average_NP_length"] = 0
parsetree_keys += ['PP_type_prop', 'VP_type_prop', 'NP_type_prop',
'PP_type_rate', 'VP_type_rate', 'NP_type_rate',
'average_PP_length', 'average_VP_length', 'average_NP_length']
# Normalize by number of productions
num_productions = len(prod_nonlexical)
fdist = nltk.probability.FreqDist(prod_nonlexical)
for prod_rule in top_rules:  # needed to ensure we always get the same number of CFG features
if prod_rule in fdist:
parsetree_features[prod_rule] = 1.0 * fdist[prod_rule] / num_productions
else:
parsetree_features[prod_rule] = 0.0
parsetree_keys += [prod_rule]
return parsetree_keys, parsetree_features
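For context, a self-contained sketch of the same normalization step, assuming a parse tree built with nltk.Tree (the sentence is made up; top_rules would come from the caller):
import nltk
tree = nltk.Tree.fromstring("(S (NP (DT the) (NN dog)) (VP (VBZ barks)))")
prod_nonlexical = [str(p) for p in tree.productions() if p.is_nonlexical()]
fdist = nltk.probability.FreqDist(prod_nonlexical)
num_productions = len(prod_nonlexical)
for rule in fdist:
    print(rule, 1.0 * fdist[rule] / num_productions)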
def load_data(file_name):
data_dir = os.path.join("../data", "stanfordmovie")
with open(os.path.join(data_dir, file_name)) as data_file:
raw_data = json.load(data_file)
train_x, train_y = map(list, zip(*raw_data))
# train_x, train_y = map(list, zip(*raw_data[:100]))
data = []
for sentences, label in zip(train_x, train_y):
words = [w for s in sentences for w in s]
data.append((FreqDist(words), label))
return data
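Each item returned by load_data is simply a (FreqDist, label) pair over a review's tokens; a toy equivalent (sentences invented) looks like:
from nltk.probability import FreqDist
sentences = [["a", "great", "movie"], ["really", "great"]]
words = [w for s in sentences for w in s]
example = (FreqDist(words), "pos")
print(example[0]["great"])  # 2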
def gt_demo():
from nltk import corpus
emma_words = corpus.gutenberg.words('austen-emma.txt')
fd = FreqDist(emma_words)
gt = GoodTuringProbDist(fd)
sgt = SimpleGoodTuringProbDist(fd)
katz = SimpleGoodTuringProbDist(fd, 7)
print('%18s %8s %12s %14s %12s' \
    % ("word", "frequency", "GoodTuring", "SimpleGoodTuring", "Katz-cutoff"))
for key in fd:
print('%18s %8d %12e %14e %12e' \
% (key, fd[key], gt.prob(key), sgt.prob(key), katz.prob(key)))
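A smaller, self-contained variant of the demo above (toy text; SimpleGoodTuringProbDist warns when a distribution is too small to fit its regression reliably, but still returns estimates):
from nltk.probability import FreqDist, SimpleGoodTuringProbDist
fd = FreqDist("the cat sat on the mat while the dog sat".split())
sgt = SimpleGoodTuringProbDist(fd)
for word in fd:
    print(word, fd[word], sgt.prob(word))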
@staticmethod
def leaf(labeled_featuresets):
label = FreqDist([label for (featureset,label)
in labeled_featuresets]).max()
return DecisionTreeClassifier(label)
def leaf(labeled_featuresets):
label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
return DecisionTreeClassifier(label)
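The max() call is what makes this a majority-vote leaf: FreqDist.max() returns the single most frequent sample (labels below are invented).
from nltk.probability import FreqDist
print(FreqDist(["pos", "neg", "pos", "pos"]).max())  # 'pos'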
def run(self):
try:
words = self.model.CORPORA[self.name]()
from operator import itemgetter
text = list(filter(lambda w: len(w) > 2, words))  # list() needed so the slices below work in Python 3
fd = FreqDist(tuple(text[i:i+2]) for i in range(len(text)-1))
vocab = FreqDist(text)
scored = [((w1,w2), fd[(w1,w2)] ** 3 / float(vocab[w1] * vocab[w2])) for w1, w2 in fd]
scored.sort(key=itemgetter(1), reverse=True)
self.model.collocations = list(map(itemgetter(0), scored))
self.model.notify_listeners(CORPUS_LOADED_EVENT)
except Exception as e:
print(e)
self.model.notify_listeners(ERROR_LOADING_CORPUS_EVENT)
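The same collocation scoring can be tried outside the GUI thread; a sketch against the Brown news category (assumes the 'brown' corpus data has been downloaded):
from operator import itemgetter
from nltk.corpus import brown
from nltk.probability import FreqDist

words = brown.words(categories="news")
text = [w for w in words if len(w) > 2]
fd = FreqDist(tuple(text[i:i + 2]) for i in range(len(text) - 1))
vocab = FreqDist(text)
scored = [(bigram, fd[bigram] ** 3 / float(vocab[bigram[0]] * vocab[bigram[1]]))
          for bigram in fd]
scored.sort(key=itemgetter(1), reverse=True)
print([" ".join(b) for b, _ in scored[:5]])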
@staticmethod
def train(labeled_featuresets, estimator=ELEProbDist):
"""
:param labeled_featuresets: A list of classified featuresets,
i.e., a list of tuples ``(featureset, label)``.
"""
label_freqdist = FreqDist()
feature_freqdist = defaultdict(FreqDist)
feature_values = defaultdict(set)
fnames = set()
# Count up how many times each feature value occurred, given
# the label and featurename.
for featureset, label in labeled_featuresets:
label_freqdist[label] += 1  # FreqDist.inc() was removed in NLTK 3; use item increment
for fname, fval in featureset.items():
# Increment freq(fval|label, fname)
feature_freqdist[label, fname][fval] += 1
# Record that fname can take the value fval.
feature_values[fname].add(fval)
# Keep a list of all feature names.
fnames.add(fname)
# If a feature didn't have a value given for an instance, then