Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def general_logistic_regression():
'''Example General Logistic Recression,
Example 7.4.1, p. 135'''
# Get the data
inFile = r'GLM_data/Table 7.5 Embryogenic anthers.xls'
df = get_data(inFile)
# Define the variables so that they match Dobson
df['n_y'] = df['n'] - df['y']
df['newstor'] = df['storage']-1
df['x'] = np.log(df['centrifuge'])
# Model 1
model1 = smf.glm('n_y + y ~ newstor*x', data=df, family=sm_families.Binomial()).fit()
print(model1.summary())
# Model 2
model2 = smf.glm('n_y + y ~ newstor+x', data=df, family=sm_families.Binomial()).fit()
print(model2.summary())
# Model 3
model3 = smf.glm('n_y + y ~ x', data=df, family=sm_families.Binomial()).fit()
print(model3 .summary())
inFile = r"GLM_data/Table 9.1 British doctors' smoking and coronary death.xls"
df = get_data(inFile)
print(df)
# Generate the required variables
df['smoke'] = np.zeros(len(df))
df['smoke'][df['smoking']=='smoker']=1
df['agecat'] = np.array([1,2,3,4,5,1,2,3,4,5])
df['agesq'] = df['agecat']**2
df['smkage'] = df['agecat']
df['smkage'][df['smoking']=='non-smoker']=0
model = smf.glm('deaths~agecat+agesq+smoke+smkage',
family=sm_families.Poisson(), data=df,
exposure=df["person-years"]).fit()
print(model.summary())
vb01 = cov_matrix.loc['E0M1']['E0M1']
vb11 = cov_matrix.loc['E1M1']['E1M1']
cvb10_01 = cov_matrix.loc['E1M0']['E0M1']
cvb10_11 = cov_matrix.loc['E1M0']['E1M1']
cvb01_11 = cov_matrix.loc['E0M1']['E1M1']
varICR = (((em10**2)*vb10)+((em01**2)*vb01)+((em11**2)*vb11)+((em10*em01*2*cvb10_01))+(-1*em10*em11*2*cvb10_11)+(-1*em01*em11*2*cvb01_11))
icr_lcl = icr - zalpha*math.sqrt(varICR)
icr_ucl = icr + zalpha*math.sqrt(varICR)
elif ci == 'bootstrap':
bse_icr = []
ul = 1 - alpha/2
ll = 0 + alpha/2
for i in range(b_sample):
dfs = df.sample(n=len(df),replace=True)
try:
bmodel = smf.glm(eq,dfs,family=f).fit()
em_bexpect = math.exp(bmodel.params['E1M0']) + math.exp(bmodel.params['E0M1']) - 1
bicr = math.exp(bmodel.params['E1M1']) - em_bexpect
sigma = bicr - icr
bse_icr.append(sigma)
except:
bse_icr.append(np.nan)
bsdf = pd.DataFrame()
bsdf['sigma'] = bse_icr
lsig,usig = bsdf['sigma'].dropna().quantile(q=[ll,ul])
icr_lcl = lsig + icr
icr_ucl = usig + icr
else:
raise ValueError('Please specify a supported confidence interval type')
print('\n----------------------------------------------------------------------')
if regression == 'logit':
print('ICR based on Odds Ratio\t\tAlpha = '+str(alpha))
pandas dataframe when initialized. Model form should contain the exposure, i.e. 'art + age + male'
print_results : bool, optional
Whether to print the logistic regression results to the terminal. Default is True
"""
if self.outcome_type == 'binary':
linkdist = sm.families.family.Binomial()
elif self.outcome_type == 'normal':
linkdist = sm.families.family.Gaussian()
elif self.outcome_type == 'poisson':
linkdist = sm.families.family.Poisson()
else:
raise ValueError("Only 'binary', 'normal', and 'poisson' distributed outcomes are available")
# Modeling the outcome
if self.weight is None:
m = smf.glm(self.outcome+' ~ '+model, self.sample, family=linkdist)
self._outcome_model = m.fit()
else:
m = smf.glm(self.outcome+' ~ '+model, self.sample, family=linkdist,
freq_weights=self.sample[self.weight])
self._outcome_model = m.fit()
# Printing results of the model and if any observations were dropped
if print_results:
print(self._outcome_model.summary())
glm = smf.glm(
'affairs ~ rate_marriage + yrs_married',
data=dc,
family=sm.families.Poisson(),
freq_weights=np.asarray(dc['freq']))
res_f2 = glm.fit()
#print(res_f2.summary())
res_f2.pearson_chi2 - res_f.pearson_chi2, res_f2.deviance - res_f.deviance, res_f2.llf - res_f.llf
# #### aggregated data: ``exposure`` and ``var_weights``
#
# Note: LR test agrees with original observations, ``pearson_chi2``
# differs and has the wrong sign.
glm = smf.glm(
'affairs_sum ~ rate_marriage + yrs_married',
data=df_a,
family=sm.families.Poisson(),
exposure=np.asarray(df_a['affairs_count']))
res_e2 = glm.fit()
res_e2.pearson_chi2 - res_e.pearson_chi2, res_e2.deviance - res_e.deviance, res_e2.llf - res_e.llf
glm = smf.glm(
'affairs_mean ~ rate_marriage + yrs_married',
data=df_a,
family=sm.families.Poisson(),
var_weights=np.asarray(df_a['affairs_count']))
res_a2 = glm.fit()
res_a2.pearson_chi2 - res_a.pearson_chi2, res_a2.deviance - res_a.deviance, res_a2.llf - res_a.llf
# ### Investigating Pearson chi-square statistic
'affairs_sum ~ rate_marriage + age + yrs_married',
data=df_a,
family=sm.families.Poisson(),
exposure=np.asarray(df_a['affairs_count']))
res_e = glm.fit()
print(res_e.summary())
res_e.pearson_chi2 / res_e.df_resid
# #### using var_weights
#
# We can also use the mean of all combined values of the dependent
# variable. In this case the variance will be related to the inverse of the
# total exposure reflected by one combined observation.
glm = smf.glm(
'affairs_mean ~ rate_marriage + age + yrs_married',
data=df_a,
family=sm.families.Poisson(),
var_weights=np.asarray(df_a['affairs_count']))
res_a = glm.fit()
print(res_a.summary())
# ### Comparison
#
# We saw in the summary prints above that ``params`` and ``cov_params``
# with associated Wald inference agree across versions. We summarize this in
# the following comparing individual results attributes across versions.
#
# Parameter estimates `params`, standard errors of the parameters `bse`
# and `pvalues` of the parameters for the tests that the parameters are
# zeros all agree. However, the likelihood and goodness-of-fit statistics,
self._out_model = self.outcome + ' ~ ' + model
if self._continuous_outcome:
self._continuous_type = continuous_distribution
if (continuous_distribution == 'gaussian') or (continuous_distribution == 'normal'):
f = sm.families.family.Gaussian()
elif continuous_distribution == 'poisson':
f = sm.families.family.Poisson()
else:
raise ValueError("Only 'gaussian' and 'poisson' distributions are supported")
else:
f = sm.families.family.Binomial()
if self._weight_ is None:
log = smf.glm(self._out_model, self.df, family=f).fit()
else:
log = smf.glm(self._out_model, self.df, freq_weights=self.df[self._weight_], family=f).fit()
if print_results:
print('\n----------------------------------------------------------------')
print('MODEL: ' + self._out_model)
print('-----------------------------------------------------------------')
print(log.summary())
# Generating predictions for observed variables
self._predicted_y_ = log.predict(self.df)
# Predicting under treatment strategies
dfx = self.df.copy()
dfx[self.exposure] = 1
self.df['_pY1_'] = log.predict(dfx)
Whether to print the logistic regression results. Default is True
Returns
-------------
Fitted statsmodels GLM object
Example
------------
>>>from zepid import load_sample_data
>>>from zepid.causal.utils import propensity_score
>>>df = load_sample_data(timevary=False)
>>>propensity_score(df=df,model='dead ~ art0 + male + dvl0')
"""
f = sm.families.family.Binomial()
if weights is None:
log = smf.glm(model, df, family=f).fit()
else:
log = smf.glm(model, df, freq_weights=df[weights], family=f).fit()
if print_results:
print('\n----------------------------------------------------------------')
print('MODEL: ' + model)
print('-----------------------------------------------------------------')
print(log.summary())
return log
'''
rf = df.copy()
rf = rf.dropna().sort_values(by=[var,outcome]).reset_index()
print('Warning: missing observations of model variables are dropped')
print(int(len(df)-len(rf)),' observations were dropped from the functional form assessment')
if f_form == None:
f_form = var
else:
pass
if link_dist == None:
link_dist = sm.families.family.Binomial(sm.families.links.logit)
else:
pass
if (loess == True) | (points == True):
if outcome_type=='binary':
djm = smf.glm(outcome+'~ C('+var+')',rf,family=link_dist).fit()
djf = djm.get_prediction(rf).summary_frame()
dj = pd.concat([rf,djf],axis=1)
dj.sort_values(var,inplace=True)
if points == True:
pf = dj.groupby(by=[var,'mean']).count().reset_index()
if loess == True:
yl = lowess(list(dj['mean']),list(dj[var]),frac=loess_value)
lowess_x = list(zip(*yl))[0]
lowess_y = list(zip(*yl))[1]
elif outcome_type=='continuous':
if points == True:
pf = rf.groupby(by=[var,outcome]).count().reset_index()
if loess == True:
yl = lowess(list(rf[outcome]),list(rf[var]),frac=loess_value)
lowess_x = list(zip(*yl))[0]
lowess_y = list(zip(*yl))[1]
# Toss a six-sided die 5 times, what's the probability of exactly 2 fours?
#
stats.binom(5, 1./6).pmf(2)
#
from scipy.misc import comb
comb(5,2) * (1/6.)**2 * (5/6.)**3
#
from statsmodels.formula.api import glm
glm_mod = glm(formula, dta, family=sm.families.Binomial()).fit()
#
print glm_mod.summary()
#
# The number of trials
#
glm_mod.model.data.orig_endog.sum(1)
#
glm_mod.fittedvalues * glm_mod.model.data.orig_endog.sum(1)