Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def crossvalidate_formula(formula, data, response_column, k):
PRESS = 0
for i in range(k):
start = i * (len(data) // k)
end = (i + 1) * (len(data) // k) if i < k - 1 else len(data)
to_drop = data.index[start: end]
train = data.drop(to_drop)
test = data.loc[to_drop]
model = smf.ols(formula, train).fit()
pred = model.predict(test)
residuals = test[response_column] - pred
PRESS += (residuals ** 2).sum()
response = data[response_column]
Q2 = 1 - PRESS / ((response - response.mean()) ** 2).sum()
return Q2
candidate_split = best_candidate.split(sep=":")
if len(candidate_split) == 2:
if candidate_split[0] not in selected and candidate_split[0] in remaining:
remaining.remove(candidate_split[0])
selected.append(candidate_split[0])
maxvars += 1
if candidate_split[1] not in selected and candidate_split[1] in remaining:
remaining.remove(candidate_split[1])
selected.append(candidate_split[1])
maxvars += 1
remaining.remove(best_candidate)
selected.append(best_candidate)
current_score = best_new_score
formula = "{} ~ {} + 1".format(response,
' + '.join(selected))
model = smf.ols(formula, data).fit()
return model
def _get_best_configs(self, parameters, results, configs, lower_is_better,
alpha=0.05):
"""
Implements the testing procedure itself and returns the reduced set
of parameter configurations.
"""
df = self._prep_df_for_linreg(parameters, results,
configs, lower_is_better)
l = 1
h = df.Rank.max()
p = h
while l != h:
lm = ols('Objective ~ C(Rank)', data=df.loc[df.Rank <= p, :]).fit()
p_value = sm.stats.anova_lm(lm, typ=2).loc[:, "PR(>F)"].ix["C(Rank)"]
reject = p_value < alpha
if reject:
h = p - 1
else:
l = p
p = math.ceil((l + h) / 2)
return df.loc[df.Rank <= p, :].loc[:,
[p.name for p in parameters]].drop_duplicates().to_dict(
'records')
def calculateDifferenceInDifferences(self, df):
#dropna is necessary to remove non-pre and post values.
reg = smf.ols('kpi ~ post + treat + treat*post', data = df.dropna()).fit()
differenceInDifferencesEstimateIndex = 3
return {
"estimate": reg.params[differenceInDifferencesEstimateIndex],
"confidence interval": list((reg._results.conf_int(alpha=0.05, cols=[differenceInDifferencesEstimateIndex]))[0])
}
weekday_df = self.add_cdd(weekday_df)
weekday_df = self.add_hdd(weekday_df)
weekend_df = train_df.loc[train_df['day_of_week'].isin(self.weekends)]
weekend_df = self.add_hdd(weekend_df)
weekend_df = self.add_cdd(weekend_df)
try:
self.model_weekday = smf.ols(formula=self.formula, data=weekday_df)
self.model_res_weekday = self.model_weekday.fit()
except ValueError:
self.model_weekday = None
self.model_res_weekday = None
try:
self.model_weekend = smf.ols(formula=self.formula, data=weekend_df)
self.model_res_weekend = self.model_weekend.fit()
except ValueError:
self.model_weekend = None
self.model_res_weekend = None
params = {
"coefficients": self.model_res_weekday.params.to_dict(),
"coefficients_weekend": self.model_res_weekend.params.to_dict(),
"formula": self.formula,
"cdd_bp": self.cdd_base_temp,
"hdd_bp": self.hdd_base_temp,
"X_design_info": ''
}
weekday_model_stats = self.get_model_stats(self.model_res_weekday, weekday_df)
weekend_model_stats = self.get_model_stats(self.model_res_weekend, weekend_df)
fig.subplots_adjust(wspace=0.0)
title = '{0} Conv2D Layer {1}'.format(name, layer_id+1)
plt.suptitle(title, x=0.55, y=1.0)
if not os.path.exists('./feat_plots'):
os.mkdir('feat_plots')
plt.savefig('./feat_plots/{0}.png'.format(title))
#fig.savefig("foo.pdf", bbox_inches='tight')
plt.clf()
anova_all = np.array(anova_all)
df = pd.DataFrame(data=anova_all[1:,1:],index=anova_all[1:,0].tolist(),columns=anova_all[0,1:].tolist())
df.colums = ['id', 'y', 'layer_id', 'is_sparse']
df = df.astype({'y' : 'float32', 'layer_id' : 'int32', 'is_sparse' : 'int32'})
formula = 'y ~ C(layer_id) + C(is_sparse) + C(layer_id)*C(is_sparse)'
model = ols(formula, df).fit()
aov_table = anova_lm(model, typ=1)
eta_squared(aov_table)
omega_squared(aov_table)
print(aov_table)
clustergrp = merged_train.groupby('cluster').mean()
print('\nClustering variable means by cluster:')
print(clustergrp)
# _________validate clusters in training data by examining cluster differences
# in wine quality (validation variable) using ANOVA_____________
# merge wine quality with clustering variables and cluster assignment data
qual = wine_set['quality']
# split quality data into train and test sets
qual_train, qual_test = train_test_split(qual, test_size=.3, random_state=123)
qual_train1 = pd.DataFrame(qual_train)
qual_train1.reset_index(level=0, inplace=True)
merged_train_all = pd.merge(qual_train1, merged_train, on='index')
sub1 = merged_train_all[['quality', 'cluster']]
mod = smf.ols(formula='quality ~ C(cluster)', data=sub1).fit()
print(mod.summary())
print('\nMeans for wine quality by cluster:')
print(sub1.groupby('cluster').mean())
print('\nStandard deviations for wine quality by cluster:')
print(sub1.groupby('cluster').std())
# perform Post hoc test (using Tukey's Honestly Significant Difference Test)
mc1 = multi.MultiComparison(sub1['quality'], sub1['cluster'])
res1 = mc1.tukeyhsd()
print(res1.summary())
df=pd.DataFrame(tSet, columns=CN_rec._fields)
#exclude from training data, DELs and DUPs with CN in the tails of the distribution
df.loc[:,'q_low']=df.groupby(['sample', 'svtype', 'GT'])['log2r'].transform(lowQuantile)
df.loc[:,'q_high']=df.groupby(['sample', 'svtype', 'GT'])['log2r'].transform(highQuantile)
df=df[(df.log2r>=df.q_low) & (df.log2r<=df.q_high)]
#df.to_csv('./train.csv')
#adjust copy number for small deletions (<1kb), no strong relationship b/w cn and size for dups evident so far
small_het_dels = df[(df.svtype=="DEL") & (df.GT=="0/1") & (df.svlen<1000) & (df.svlen>=50)].copy()
small_hom_dels = df[(df.svtype=="DEL") & (df.GT=="1/1") & (df.svlen<1000) & (df.svlen>=50)].copy()
het_del_mean=np.mean(df[(df.svlen>1000) & (df.GT=="0/1") & (df.svtype=="DEL")]['log2r'])
hom_del_mean=np.mean(df[(df.svlen>1000) & (df.GT=="1/1") & (df.svtype=="DEL")]['log2r'])
small_het_dels.loc[:,'offset']=small_het_dels.loc[:,'log2r']-het_del_mean
small_hom_dels.loc[:,'offset']=small_hom_dels.loc[:,'log2r']-hom_del_mean
with warnings.catch_warnings():
warnings.filterwarnings("ignore")
hom_del_fit=smf.ols('offset~log_len',small_hom_dels).fit()
het_del_fit=smf.ols('offset~log_len',small_het_dels).fit()
#print hom_del_fit.summary()
#print het_del_fit.summary()
small_hom_dels.loc[:,'log2r_adj'] = small_hom_dels.loc[:,'log2r'] - hom_del_fit.predict(small_hom_dels)
small_het_dels.loc[:,'log2r_adj'] = small_het_dels.loc[:,'log2r'] - het_del_fit.predict(small_het_dels)
small_dels=small_hom_dels.append(small_het_dels)
small_dels=small_dels[['var_id', 'sample', 'svtype', 'svlen', 'AF', 'GT', 'CN', 'log_len', 'log2r', 'q_low', 'q_high', 'log2r_adj']]
# dels of length<100 bp are excluded here
df1=df.loc[(df.svtype!="DEL") | (df.GT=="0/0") | (df.svlen>=1000), :].copy()
df1.loc[:,'log2r_adj']=df1.loc[:,'log2r']
df1=df1.append(small_dels)
params=df1.groupby(['sample', 'svtype', 'GT'])['log2r_adj'].aggregate([np.mean,np.var, len]).reset_index()
params=pd.pivot_table(params, index=['sample', 'svtype'], columns='GT', values=['mean', 'var', 'len']).reset_index()
params.columns=['sample', 'svtype', 'mean0', 'mean1', 'mean2', 'var0', 'var1', 'var2', 'len0', 'len1', 'len2']
params['std_pooled'] = np.sqrt((params['var0']*params['len0']+params['var1']*params['len1']+params['var2']*params['len2'])/(params['len0']+params['len1']+params['len2']))
#params.to_csv('./params.csv')
min_lm = ols('JPERF ~ TEST', data=minority_table).fit()
print min_lm.summary()
#
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111, xlabel='TEST', ylabel='JPERF')
for factor, group in factor_group:
ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
marker=markers[factor], s=12**2)
ax.legend(['ETHN == 1', 'ETHN == 0'], scatterpoints=1, loc='upper left')
fig = abline_plot(model_results = min_lm, ax=ax)
#
min_lm2 = ols('JPERF ~ TEST + TEST:ETHN', data=minority_table).fit()
print min_lm2.summary()
#
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111, xlabel='TEST', ylabel='JPERF')
for factor, group in factor_group:
ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
marker=markers[factor], s=12**2)
fig = abline_plot(intercept = min_lm2.params['Intercept'],
slope = min_lm2.params['TEST'], ax=ax, color='purple')
ax = fig.axes[0]
fig = abline_plot(intercept = min_lm2.params['Intercept'],
slope = min_lm2.params['TEST'] + min_lm2.params['TEST:ETHN'],
ax=ax, color='green')