Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Description: Entropy based discretization compared to discretization with equal-frequency
# of instances in intervals
# Category: preprocessing
# Uses: wdbc.tab
# Classes: Preprocessor_discretize, EntropyDiscretization
# Referenced: o_categorization.htm
import orange
def show_values(data, heading):
    """Print one line per attribute of *data*: name, value count, values.

    Args:
        data: object exposing ``domain.attributes``; each attribute must
            provide ``name`` and ``values`` (a sequence of strings).
        heading: accepted so existing callers keep working; it is not
            printed.
    """
    for attribute in data.domain.attributes:
        # str.join replaces the reduce()-based concatenation: it yields an
        # empty string for an attribute without values (reduce() raised
        # TypeError there) and does not rely on the Python 2 builtin.
        joined_values = ", ".join(attribute.values)
        # A single parenthesised argument prints the same text on
        # Python 2 and Python 3.
        print("%s/%d: %s" % (attribute.name, len(attribute.values), joined_values))
# Load the data set and report how many features it starts with.
data = orange.ExampleTable("wdbc.tab")
print('%d features in original data set, discretized:' % len(data.domain.attributes))

# Discretize with the entropy-based method and list the resulting values.
data_ent = orange.Preprocessor_discretize(
    data, method=orange.EntropyDiscretization())
show_values(data_ent, "Entropy based discretization")

# Name every feature that ended up with exactly one value.
print('\nFeatures with sole value after discretization:')
for a in data_ent.domain.attributes:
    if len(a.values) != 1:
        continue
    print(a.name)

# Same discretization through orngDisc, then report the feature count.
import orngDisc
data_ent2 = orngDisc.entropyDiscretization(data)
print('%d features after removing features discretized to a constant value'
      % len(data_ent2.domain.attributes))
# Description: Shows how to round-off the cut-off points used for categorization.
# Category: preprocessing
# Uses: iris
# Classes: EquiNDiscretization, EntropyDiscretization
# Referenced: o_categorization.htm
import orange
# Build the two discretization methods used below.
iris = orange.ExampleTable("iris")
equiN = orange.EquiNDiscretization(numberOfIntervals=4)
entropy = orange.EntropyDiscretization()

# Discretize petal length and sepal length.
pl = equiN("petal length", iris)
sl = equiN("sepal length", iris)
sl_ent = entropy("sepal length", iris)

# Replace the petal-length cut-off points with their rounded values.
points = pl.getValueFrom.transformer.points
points2 = list(map(round, points))
pl.getValueFrom.transformer.points = points2

# Report the cut-off points of every discretized attribute.
for attribute in [pl, sl, sl_ent]:
    print("Cut-off points for %s are %s" % (
        attribute.name, attribute.getValueFrom.transformer.points))
def __init__(self, discr=orange.EntropyDiscretization(), learnr=orange.BayesLearner()):
    """Store the discretization method and the learner on the instance.

    Both defaults are evaluated once, when the function is defined, so
    every instance created without explicit arguments shares the same two
    default objects.
    """
    self.learner = learnr
    self.disc = discr
# Description: Entropy based discretization compared to discretization with equal-frequency
# of instances in intervals
# Category: preprocessing
# Uses: wdbc.tab
# Classes: Preprocessor_discretize, EntropyDiscretization
# Referenced: o_categorization.htm
import orange
def show_values(data, heading):
    """Print one line per attribute of *data*: name, value count, values.

    Args:
        data: object exposing ``domain.attributes``; each attribute must
            provide ``name`` and ``values`` (a sequence of strings).
        heading: accepted so existing callers keep working; it is not
            printed.
    """
    for attribute in data.domain.attributes:
        # str.join replaces the reduce()-based concatenation: it yields an
        # empty string for an attribute without values (reduce() raised
        # TypeError there) and does not rely on the Python 2 builtin.
        joined_values = ", ".join(attribute.values)
        # A single parenthesised argument prints the same text on
        # Python 2 and Python 3.
        print("%s/%d: %s" % (attribute.name, len(attribute.values), joined_values))
# Load the data set and report how many features it starts with.
data = orange.ExampleTable("../datasets/wdbc")
print('%d features in original data set, discretized:' % len(data.domain.attributes))

# Discretize with the entropy-based method and list the resulting values.
data_ent = orange.Preprocessor_discretize(
    data, method=orange.EntropyDiscretization())
show_values(data_ent, "Entropy based discretization")

# Name every feature that ended up with exactly one value.
print('\nFeatures with sole value after discretization:')
for a in data_ent.domain.attributes:
    if len(a.values) != 1:
        continue
    print(a.name)

# Re-import orngDisc so the latest version of the module is used, then
# discretize through it and report the feature count.
import orngDisc
reload(orngDisc)
data_ent2 = orngDisc.entropyDiscretization(data)
print('%d features after removing features discretized to a constant value'
      % len(data_ent2.domain.attributes))
# Description: Shows how to find out which cut-off points were introduced by Orange's automatic categorization routines.
# Category: preprocessing
# Uses: iris
# Classes: EquiNDiscretization, EntropyDiscretization
# Referenced: o_categorization.htm
import orange
# Build the two discretization methods used below.
iris = orange.ExampleTable("iris")
equiN = orange.EquiNDiscretization(numberOfIntervals=4)
entropy = orange.EntropyDiscretization()

# Discretize petal length and sepal length.
pl = equiN("petal length", iris)
sl = equiN("sepal length", iris)
sl_ent = entropy("sepal length", iris)

# Report the cut-off points of every discretized attribute.
for attribute in [pl, sl, sl_ent]:
    print("Cut-off points for %s are %s" % (
        attribute.name, attribute.getValueFrom.transformer.points))
def _prepare(self, t):
# Prepares an Orange table so that it doesn't contain continuous
# attributes or missing values.
# NOTE(review): only the discretization step is visible in this listing; it
# stops at the "FIX MISSING VALUES" marker, and the original indentation was
# lost, so the nesting of the statements below must be checked against the
# real source file.
### DISCRETIZE VARIABLES ###
# newatt collects the discretized replacements, oldatt the attributes that
# are kept as they are.
newatt = []
oldatt = []
# Entropy-based discretization is tried first; the two-interval
# discretization below is the fallback.
entroD = orange.EntropyDiscretization()
equiD = orange.EquiNDiscretization(numberOfIntervals = 2)
for i in t.domain.attributes:
# presumably varType 2 marks a continuous attribute -- TODO confirm against
# Orange's varType constants
if i.varType == 2:
d = entroD(i,t)
if len(d.values) < 2:
# prevent discretization into a single value
d = equiD(i,t)
# the fallback attribute gets an 'E' prefix on its name
d.name = 'E'+d.name
# relies on a `warnings` name imported elsewhere in the file
warnings.warn('Discretizing %s into %s with %d values.'%(i.name,d.name,len(d.values)))
newatt.append(d)
else:
oldatt.append(i)
# Rebuild the table only when at least one attribute was discretized; the
# class variable is placed last.
if len(newatt) > 0:
t = t.select(oldatt+newatt+[t.domain.classVar])
### FIX MISSING VALUES ###
# Category: preprocessing
# Uses: iris
# Classes: EquiNDiscretization, EntropyDiscretization
# Referenced: o_categorization.htm
def printexamples(data, inxs, msg="First %i examples"):
    """Print a heading, the selected rows of *data*, then a blank line.

    Args:
        data: indexable collection; ``data[i]`` is printed for every index.
        inxs: sequence of indices to print, in the order given.
        msg: heading template with one ``%i`` placeholder that receives
            ``len(inxs)``.
    """
    # Each print call takes a single parenthesised argument, so the output
    # is the same on Python 2 and Python 3 (the original print statements
    # are a syntax error on Python 3).
    print(msg % len(inxs))
    for index in inxs:
        print("%s %s" % (index, data[index]))
    # An explicit empty string is used because a bare `print()` would print
    # "()" on Python 2.
    print("")
import orange
# Build the two discretization methods used below.
iris = orange.ExampleTable("iris")
equiN = orange.EquiNDiscretization(numberOfIntervals=4)
entropy = orange.EntropyDiscretization()

# Discretize petal length and sepal length.
pl = equiN("petal length", iris)
sl = equiN("sepal length", iris)
sl_ent = entropy("sepal length", iris)

# Rows shown in both listings.
inxs = [0, 15, 35, 50, 98]

# New table mixing original attributes with their discretized versions.
d_iris = iris.select(
    ["sepal width", pl, "sepal length", sl, sl_ent, iris.domain.classVar])

printexamples(iris, inxs, "%i examples before discretization")
# The second listing shows the discretized table, so its heading says
# "after" (the original repeated "before" for both listings).
printexamples(d_iris, inxs, "%i examples after discretization")
def __call__(self, data, weight=None):
    """Discretize *data* with the entropy method and train a Bayes learner.

    Args:
        data: examples passed to ``orange.Preprocessor_discretize``.
        weight: forwarded unchanged to ``orange.BayesLearner``.

    Returns:
        A ``Classifier`` built with the trained model as ``classifier``.
    """
    discretizer = orange.EntropyDiscretization()
    discretized = orange.Preprocessor_discretize(data, method=discretizer)
    return Classifier(classifier=orange.BayesLearner(discretized, weight))
# Print the cut-off points of every attribute in newattrs, formatted with
# three decimals and indented by 17 spaces.
# NOTE(review): newattrs is defined outside this excerpt, and the original
# indentation was lost, so it cannot be told here whether the bare `print`
# belongs to the loop body or follows the loop -- confirm against the real
# source file.
for attr in newattrs:
print " "*17 + "cutoffs at " + ", ".join(["%5.3f" % x for x in attr.getValueFrom.transformer.points])
print
print("\nManual construction of EquiDistDiscretizer - all attributes")

# One discretizer with fixed settings is applied to every attribute.
edisc = orange.EquiDistDiscretizer(firstCut=2.0, step=1.0, numberOfIntervals=5)
newattrs = [edisc.constructVariable(attr) for attr in data.domain.attributes]

# New table made of the constructed attributes plus the class variable.
data2 = data.select(newattrs + [data.domain.classVar])

# Show the first ten examples.
for ex in data2[:10]:
    print(ex)
# Entropy-based discretization (announced as "Fayyad-Irani" in the output):
# each attribute of data is discretized separately and printed with its
# name and the points of its transformer.
print "\nFayyad-Irani discretization"
entro = orange.EntropyDiscretization()
for attr in data.domain.attributes:
disc = entro(attr, data)
print "%s: %s" % (attr.name, disc.getValueFrom.transformer.points)
# NOTE(review): the original indentation was lost; whether this bare `print`
# belongs to the loop body or follows the loop must be confirmed against
# the real source file.
print
# Binary class variable whose value is computed from the "iris" value of
# each example.
newclass = orange.EnumVariable("is versicolor", values=["no", "yes"])
newclass.getValueFrom = lambda ex, w: ex["iris"]=="Iris-versicolor"

# Same attributes as before, with the new binary class.
newdomain = orange.Domain(data.domain.attributes, newclass)
data_v = orange.ExampleTable(newdomain, data)

print("\nBi-Modal discretization on binary problem")
bimod = orange.BiModalDiscretization(splitInTwo=0)
for attr in data_v.domain.attributes:
    disc = bimod(attr, data_v)
    print("%s: %s" % (attr.name, disc.getValueFrom.transformer.points))