Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Heart-disease Bayesian-network script (excerpt).  `heartDisease`, `np`,
# `BayesianModel`, `MaximumLikelihoodEstimator` and `VariableElimination`
# are bound before this excerpt begins.

# Drop the two columns that the network below does not use.
del heartDisease['thal']
del heartDisease['oldpeak']

# Replace the '?' placeholders with NaN.
heartDisease = heartDisease.replace('?', np.nan)
print(heartDisease.dtypes)
heartDisease.columns

# Directed edges of the network.  Each edge is listed exactly once; a
# repeated ('sex', 'trestbps') entry adds nothing to the graph.
model = BayesianModel([
    ('age', 'trestbps'),
    ('age', 'fbs'),
    ('sex', 'trestbps'),
    ('exang', 'trestbps'),
    ('trestbps', 'heartdisease'),
    ('fbs', 'heartdisease'),
    ('heartdisease', 'restecg'),
    ('heartdisease', 'thalach'),
    ('heartdisease', 'chol'),
])

# Learn the CPDs from the data with maximum-likelihood estimation.
model.fit(heartDisease, estimator=MaximumLikelihoodEstimator)
print(model.get_cpds('age'))
print(model.get_cpds('chol'))
print(model.get_cpds('sex'))
model.get_independencies()

# Exact inference over the fitted network.
HeartDisease_infer = VariableElimination(model)

# Distribution of 'heartdisease' given age == 28.
q = HeartDisease_infer.query(variables=['heartdisease'], evidence={'age': 28})
print(q['heartdisease'])

# Distribution of 'heartdisease' given chol == 100.
q = HeartDisease_infer.query(
    variables=['heartdisease'], evidence={'chol': 100})
print(q['heartdisease'])
# Asia-network example: load a BIF file, build a variable-elimination engine
# and run two conditional queries.
#
# The two lines below are notebook shell commands kept for reference.
# NOTE(review): they would leave `asia.bif` in the working directory, while
# the reader below opens 'data/asia.bif' -- confirm the file is moved there.
# NOTE(review): `gzip -d` normally removes the .gz archive itself, so the
# `| rm asia.bif.gz` part looks unnecessary -- verify before reusing.
#!wget http://www.bnlearn.com/bnrepository/asia/asia.bif.gz
#!gzip -qd asia.bif.gz | rm asia.bif.gz
from pgmpy.readwrite import BIFReader
reader = BIFReader('data/asia.bif')
asia_model = reader.get_model()
# Notebook-style inspections: the return values of the next two calls are
# discarded here.
asia_model.nodes()
asia_model.edges()
# NOTE(review): `CPDs` is not read again within this excerpt.
CPDs = asia_model.get_cpds()
# Doing exact inference using Variable Elimination
from pgmpy.inference import VariableElimination
asia_infer = VariableElimination(asia_model)
# Computing the probability of bronc given smoke.
q = asia_infer.query(variables=['bronc'], evidence={'smoke': 0})
print(q['bronc'])
# The string literal below is a bare expression used as a block comment; it
# works through Bayes' rule for the query that follows.
'''
Sanity check.
p(A=t|T=t) = p(A=t) p(T=t|A=t) / [
p(A=t) p(T=t|A=t) + p(A=f) p(T=t|A=f)]
= 0.01 * 0.05 / (0.01 * 0.05 + 0.99 * 0.01)
= 0.0481
'''
# 0 = True. 1 = False
q = asia_infer.query(variables=['asia'], evidence={'tub': 0})
print(q['asia'])
evidence_card=[2, 2])
# CPD for L (two states) conditioned on its single parent G (three states).
# `paramsL` is defined outside this excerpt.
cpd_l = TabularCPD(
    variable='L',
    variable_card=2,
    values=paramsL,
    evidence=['G'],
    evidence_card=[3],
)

# CPD for S (two states) conditioned on its single parent I (two states).
# `paramsS` is defined outside this excerpt.
cpd_s = TabularCPD(
    variable='S',
    variable_card=2,
    values=paramsS,
    evidence=['I'],
    evidence_card=[2],
)

# Register all five CPDs on the network, then validate it.
model.add_cpds(cpd_d, cpd_i, cpd_g, cpd_l, cpd_s)
model.check_model()

# Build the engine a single time so the elimination order is computed once
# and reused by every later query.
inf_engine_ve = VariableElimination(model)
def infer_pgmpy(evidence, query):
    """Return the posterior marginal of one variable as a raw array.

    Queries the module-level ``inf_engine_ve`` engine for ``query`` given
    ``evidence``, selects the factor stored under ``query`` in the result,
    and hands back that factor's ``values``.

    Args:
        evidence: Dict mapping observed variable names to their states
            (may be empty).
        query: Name of the variable whose marginal is wanted.

    Returns:
        The ``values`` attribute of the resulting factor (the array behind
        the DiscreteFactor).
    """
    posterior = inf_engine_ve.query([query], evidence=evidence)
    return posterior[query].values
## Check both inference engines give same posterior marginals
# Evidence scenarios to compare: no evidence first, then two partial
# observations.
evlist = [
    {},
    {'G': 0, 'D': 0},
    {'L': 0, 'D': 1, 'S': 1},
]
# For every evidence scenario, split the network's nodes into the observed
# ("visible") set and the remaining ("hidden") set.  `dag` is defined outside
# this excerpt.
# NOTE(review): the loop body appears to continue beyond the lines shown
# here, and the indentation of this snippet has been lost.
for evidence in evlist:
all_nodes = set(dag.keys())
vis_nodes = set(evidence.keys())
hid_nodes = all_nodes.difference(vis_nodes)
# CPD for S (two states) conditioned on its single parent I (two states).
cpd_s = TabularCPD(
    variable='S',
    variable_card=2,
    values=[
        [0.95, 0.2],
        [0.05, 0.8],
    ],
    evidence=['I'],
    evidence_card=[2],
)

# Attach the five CPDs to the network.
model.add_cpds(cpd_d, cpd_i, cpd_g, cpd_l, cpd_s)

# Validate the network structure and confirm that every CPD is well defined
# and sums to 1.
model.check_model()

from pgmpy.inference import VariableElimination

infer = VariableElimination(model)

# Expected: p(I=1) = 0.3
print(infer.query(['I'])['I'])

# Expected: P(I=1|G=0) = 0.6133
print(infer.query(['I'], evidence={'G': 0})['I'])

# Expected: P(I=1|G=0,D=0) = 0.5625
print(infer.query(['I'], evidence={'G': 0, 'D': 0})['I'])

# Expected: P(S=1|G=0) = 0.5099
print(infer.query(['S'], evidence={'G': 0})['S'])
# CPD for S (two states) conditioned on its single parent I (two states).
cpd_s = TabularCPD(
    variable='S',
    variable_card=2,
    values=[
        [0.95, 0.2],
        [0.05, 0.8],
    ],
    evidence=['I'],
    evidence_card=[2],
)

# Attach the five CPDs to the network.
model.add_cpds(cpd_d, cpd_i, cpd_g, cpd_l, cpd_s)

# Validate the network structure and confirm that every CPD is well defined
# and sums to 1.
model.check_model()

from pgmpy.inference import VariableElimination

infer = VariableElimination(model)

# Expected: p(I=1) = 0.3
print(infer.query(['I'])['I'])

# Expected: P(I=1|G=0) = 0.6133
print(infer.query(['I'], evidence={'G': 0})['I'])

# Expected: P(I=1|G=0,D=0) = 0.5625
print(infer.query(['I'], evidence={'G': 0, 'D': 0})['I'])

# Expected: P(S=1|G=0) = 0.5099
print(infer.query(['S'], evidence={'G': 0})['S'])
"""
# Body of a prediction method; its `def` line and the start of its docstring
# lie outside this excerpt, and the indentation of this snippet has been lost.
# It fills in the model variables that are absent from `data` by running one
# MAP query per distinct row.
from pgmpy.inference import VariableElimination
# Reject data that already covers every node (nothing to predict) or that
# carries columns the model does not know.
if set(data.columns) == set(self.nodes()):
raise ValueError("No variable missing in data. Nothing to predict")
elif set(data.columns) - set(self.nodes()):
raise ValueError("Data has variables which are not in the model")
# Query each distinct row only once; the merge at the end maps the results
# back onto every original row.
data_unique = data.drop_duplicates()
missing_variables = set(self.nodes()) - set(data_unique.columns)
# pred_values = defaultdict(list)
# NOTE(review): this empty list is overwritten below before it is ever read.
pred_values = []
# Send state_names dict from one of the estimated CPDs to the inference class.
# NOTE(review): the comment above mentions state_names, but only `self` is
# passed to the inference class here -- the comment may be stale.
model_inference = VariableElimination(self)
# One `map_query` call per unique row, dispatched through Parallel/delayed
# (presumably joblib's -- confirm against the file's imports) with a tqdm
# progress bar over the rows.
pred_values = Parallel(n_jobs=n_jobs)(
delayed(model_inference.map_query)(
variables=missing_variables,
evidence=data_point.to_dict(),
show_progress=False,
)
for index, data_point in tqdm(
data_unique.iterrows(), total=data_unique.shape[0]
)
)
# Put the predictions next to the unique rows, then left-merge onto the full
# input so duplicated rows receive the same prediction.
df_results = pd.DataFrame(pred_values, index=data_unique.index)
data_with_results = pd.concat([data_unique, df_results], axis=1)
# NOTE(review): `missing_variables` is a set; recent pandas releases reject a
# set as a `.loc` indexer -- consider `list(missing_variables)` and verify
# against the supported pandas version.
return data.merge(data_with_results, how="left").loc[:, missing_variables]
# CPD for W (two states) conditioned on its two parents S and R (two states
# each).
cpd_w = TabularCPD(
    variable='W',
    variable_card=2,
    values=[
        [1.0, 0.1, 0.1, 0.01],
        [0.0, 0.9, 0.9, 0.99],
    ],
    evidence=['S', 'R'],
    evidence_card=[2, 2],
)

# Attach the four CPDs to the network.
model.add_cpds(cpd_c, cpd_s, cpd_r, cpd_w)

# Validate the network structure and confirm that every CPD is well defined
# and sums to 1.
model.check_model()

from pgmpy.inference import VariableElimination

infer = VariableElimination(model)

# Expected: p(R=1) = 0.5*0.2 + 0.5*0.8 = 0.5
print(infer.query(['R'])['R'])

# Expected: P(R=1|W=1) = 0.7079
print(infer.query(['R'], evidence={'W': 1})['R'])

# Expected: P(R=1|W=1,S=1) = 0.3204
print(infer.query(['R'], evidence={'W': 1, 'S': 1})['R'])
97 0.417124 0.582876
98 0.488275 0.511725
99 0.407978 0.592022
"""
# Body of a probability-prediction method; its `def` line and the start of
# its docstring lie outside this excerpt, and the indentation of this snippet
# has been lost.  For every row of `data` it computes the distribution of
# each model variable that is absent from `data`.
from pgmpy.inference import VariableElimination
# Reject data that already covers every node (nothing to predict) or that
# carries columns the model does not know.
if set(data.columns) == set(self.nodes()):
raise ValueError("No variable missing in data. Nothing to predict")
elif set(data.columns) - set(self.nodes()):
raise ValueError("Data has variables which are not in the model")
missing_variables = set(self.nodes()) - set(data.columns)
# Maps "<variable>_<state>" column names to one probability per input row.
pred_values = defaultdict(list)
model_inference = VariableElimination(self)
for index, data_point in data.iterrows():
# Query all missing variables together, using the row as evidence.
full_distribution = model_inference.query(
variables=missing_variables,
evidence=data_point.to_dict(),
show_progress=False,
)
# Reduce the combined result to one factor per variable by marginalizing
# out every other missing variable (non-destructively).
states_dict = {}
for var in missing_variables:
states_dict[var] = full_distribution.marginalize(
missing_variables - {var}, inplace=False
)
# Append this row's probability for every state of every missing variable.
# NOTE(review): `self.get_cpds(k)` is looked up again on every inner
# iteration, and the single-letter index `l` is easy to misread as `1`.
for k, v in states_dict.items():
for l in range(len(v.values)):
state = self.get_cpds(k).state_names[k][l]
pred_values[k + "_" + str(state)].append(v.values[l])
# One row per input row, same index as `data`.
return pd.DataFrame(pred_values, index=data.index)