How to use the statsmodels.api.families function in statsmodels

To help you get started, we’ve selected a few statsmodels examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github ratschlab / RiboDiff / src / ribodiff / testcnt.py View on Github external
print '\r%i genes finished.' % num

        if opts.dispDiff and np.isnan(data.dispAdjRibo[i]):
            continue
        if not opts.dispDiff and np.isnan(data.dispAdj[i]):
            continue

        response = np.hstack([data.countRibo[i, :], data.countRna[i, :]])

        if opts.dispDiff:
            disp = np.hstack([np.repeat(data.dispAdjRibo[i], lenSampleRibo), np.repeat(data.dispAdjRna[i], lenSampleRna)])
        else:
            disp = data.dispAdj[i]

        try:
            modNB0 = sm.GLM(response, explanatory0, family=sm.families.NegativeBinomial(alpha=disp), offset=np.log(librarySizes))
            modNB1 = sm.GLM(response, explanatory1, family=sm.families.NegativeBinomial(alpha=disp), offset=np.log(librarySizes))
            result0 = modNB0.fit()
            result1 = modNB1.fit()
        except sm.tools.sm_exceptions.PerfectSeparationError:
            errorCnt += 1
        else:
            if not opts.dispDiff:
                pval[i] = 1 - chi2.cdf(result0.deviance - result1.deviance, explanatory1.shape[1] - explanatory0.shape[1])
            elif opts.dispDiff:
                pval[i] = 1 - chi2.cdf(result0.deviance - result1.deviance, (explanatory1.shape[1] - explanatory0.shape[1]) / 2.5)
            else:
                pass

    data.pval = pval

    sys.stdout.write('Warning: Failed to do test: %i genes. P value set to \'nan\'.\n' % errorCnt)
github h2oai / h2o-2 / py / testdir_single_jvm / test_GLM2_basic_cmp2.py View on Github external
n_samples, n_features = train.shape
    print "n_samples:", n_samples, "n_features:",  n_features

    print "histogram of target"
    print sp.histogram(target,3)

    print "len(train):",  len(train)
    print "len(target):", len(target)
    print "dataset shape:", dataset.shape

    if family!='gaussian':
        raise Exception("Only have gaussian logistic for scipy")

    # train the classifier
    gauss_log = sm_api.GLM(target, train, family=sm_api.families.Gaussian(sm_api.families.links.log))
    start = time.time()
    gauss_log_results = gauss_log.fit()
    print "sm_api.GLM took", time.time() - start, "seconds"
    print gauss_log_results.summary()
github pzivich / zEpid / zepid / causal / generalize / estimators.py View on Github external
"""Build the g-transport model for the outcome. This is also referred to at the Q-model.

        Parameters
        ----------
        model : str
            Variables to include in the model for predicting the outcome. Must be contained within the input
            pandas dataframe when initialized. Model form should contain the exposure, i.e. 'art + age + male'
        outcome_type : str, optional
            Variable type for the outcome. Default is binary. Also supports 'normal' and 'poisson' distributed outcomes
        print_results : bool, optional
            Whether to print the logistic regression results to the terminal. Default is True
        """
        if outcome_type == 'binary':
            linkdist = sm.families.family.Binomial()
        elif outcome_type == 'normal':
            linkdist = sm.families.family.Gaussian()
        elif outcome_type == 'poisson':
            linkdist = sm.families.family.Poisson()
        else:
            raise ValueError("Only 'binary', 'normal', and 'poisson' distributed outcomes are available")

        # Modeling the outcome
        df = self.df[self.sample].copy()
        m = smf.glm(self.outcome+' ~ '+model, df, family=linkdist)
        self._outcome_model = m.fit()

        # Printing results of the model and if any observations were dropped
        if print_results:
            print(self._outcome_model.summary())

        dfa = self.df.copy()
        dfn = self.df.copy()
github pzivich / zEpid / zepid / ICR.py View on Github external
-str object of the exposure variable name from pandas df. Ex) 'Exposure'
    modifier:
        -str object of the modifier variable name from pandas df. Ex) 'Modifier'
    adjust:
        -str object of other variables to adjust for in correct statsmodels format.
        Note: variables can NOT be named {E1M0,E0M1,E1M1} since this function creates
              variables with those names. Answers will be incorrect
         Ex) '+ C1 + C2 + C3 + Z'
    decimal:
        -Number of decimals to display in result. Default is 3
    '''
    df.loc[((df[exposure]==1)&(df[modifier]==1)),'E1M1'] = 1
    df.loc[((df[exposure]!=1)|(df[modifier]!=1)),'E1M1'] = 0
    df.loc[((df[exposure].isnull())|(df[modifier].isnull())),'E1M1'] = np.nan
    eq = outcome + '~'+exposure+'+'+modifier+'+E1M1' + adjust
    f = sm.families.family.Binomial(sm.families.links.identity)
    model = smf.glm(eq,df,family=f).fit()
    print(model.summary())
    ic = model.params['E1M1']
    print(ic)
    lcl = model.conf_int().loc['E1M1'][0]
    ucl = model.conf_int().loc['E1M1'][1]
    print('\n----------------------------------------------------------------------')
    print('Interaction Contrast')
    print('----------------------------------------------------------------------')
    print('\nIC:\t\t'+str(round(ic,decimal)))
    print('95% CI:\t\t('+str(round(lcl,decimal))+', '+str(round(ucl,decimal))+')')
    print('----------------------------------------------------------------------')
github matteosox / statcast / Scripts / post8.py View on Github external
subData['events'] = subData['events'].cat.add_categories(['Out'])

for out in outs:
    subData.loc[subData['events'] == out, 'events'] = 'Out'

subData['events'] = subData['events'].cat.remove_unused_categories()

X1 = subData.loc[:, xLabels[:-1]]
X2 = subData.loc[:, xLabels]
y1 = subData[yLabel] == 'Home Run'
y2 = subData[yLabel]

skf = StratifiedKFold(n_splits=10, shuffle=True)

glm = BetterGLM(
    SMParams={'family': sm.families.Binomial(sm.families.links.probit)})
mnLogit = BetterMNLogit()

y11p = cross_val_predict(glm, X1, y1, cv=skf, n_jobs=-1,
                         method='predict_proba')
y21p = cross_val_predict(glm, X2, y1, cv=skf, n_jobs=-1,
                         method='predict_proba')
y12p = cross_val_predict(mnLogit, X1, y2, cv=skf, n_jobs=-1,
                         method='predict_proba')
y22p = cross_val_predict(mnLogit, X2, y2, cv=skf, n_jobs=-1,
                         method='predict_proba')

# %% Log-loss

L11 = np.exp(-log_loss(y1, y11p))
L21 = np.exp(-log_loss(y1, y21p))
L12 = np.exp(-log_loss(y2, y12p))
github schatzlab / scikit-ribo / scripts / TE_modelling.py View on Github external
sys.stderr.write("[status]\tstatsmodels version:\t" + str(statsmodels.__version__) + "\n")
        # define model formula
        self.df = self.df[['ribosome_count', 'gene', 'codon', 'avgProb_scaled', 'logTPM_scaled']]
        formula = 'ribosome_count ~ C(gene) + C(codon) + avgProb_scaled'
        sys.stderr.write("[status]\tFormula: " + str(formula) + "\n")
        # define model fitting options
        solver = "IRLS"  # "lbfgs"
        tolerance = 1e-4
        numIter = 100
        sys.stderr.write("[status]\tSolver: " + solver + "\n")
        sys.stderr.write("[status]\tConvergence tolerance: " + str(tolerance) + "\n")
        sys.stderr.write("[status]\tMaxiter: " + str(numIter) + "\n")
        # model fitting NegativeBinomial GLM
        sys.stderr.write("[status]\tModel: smf.glm(formula, self.df, family=sm.families.NegativeBinomial(), "
                         "offset=self.df['logTPM_scaled']" + "\n")
        model = smf.glm(formula, self.df, family=sm.families.NegativeBinomial(), offset=self.df['logTPM_scaled'])
        res = model.fit(method=solver, tol=tolerance, maxiter=numIter)
        # print model output
        print(res.summary(), flush=True)
github pzivich / zEpid / docs / causal_comparisons.py View on Github external
g.fit(treatment='none')
    r_none = g.marginal_outcome
    rd_results.append(r_all - r_none)
    rr_results.append(r_all / r_none)


print('RD 95% CI:',np.percentile(rd_results,q=[2.5,97.5]))
print('RR 95% CI:',np.percentile(rr_results,q=[2.5,97.5]))
#IPTW
model = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
df['iptw'] = ze.ipw.iptw(df,treatment='art',model_denominator=model,stabilized=True)
ind = sm.cov_struct.Independence()
f = sm.families.family.Binomial(sm.families.links.identity)
linrisk = smf.gee('dead ~ art',df['id'],df,cov_struct=ind,family=f,weights=df['iptw']).fit()
linrisk.summary()
f = sm.families.family.Binomial(sm.families.links.log)
log = smf.gee('dead ~ art',df['id'],df,cov_struct=ind,family=f,weights=df['iptw']).fit()
log.summary()
#Double-Robust
sdr = SimpleDoubleRobust(df,exposure='art',outcome='dead')
sdr.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
sdr.outcome_model('art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
sdr.fit()
sdr.summary()
rd = []
rr = []
for i in range(500):
    dfs = df.sample(n=df.shape[0],replace=True)
    s = SimpleDoubleRobust(dfs,exposure='art',outcome='dead')
    s.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', print_results=False)
    s.outcome_model('art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', print_results=False)
    s.fit()
github prophyle / prophyle / prophyle / prophyle_enet_statsmodel_abundance.py View on Github external
vec_pos = {leaf.name: i for i, leaf in enumerate(tree)}

    count_fn = '.'.join(asg_fn.split('.')[:-1]+['npy'])
    if os.path.isfile(count_fn):
        print("Loading counts from existing npy file", file=sys.stderr)
        map_counts = np.load(count_fn)
    else:
        map_counts = analyse_assignments(asg_fn, nodes2leaves, vec_pos)
        np.save(count_fn, map_counts)

    assert len(leaves) == len(map_counts), "Length of mappings different from #leaves...try to remove  and analyse assignments again using the right tree!"

    sim_mat = np.load(sim_mat_fn)
    assert len(leaves) == len(sim_mat), "Size of similarity matrix different from #leaves...have you used the right index/tree?"

    glm = sm.GLM(map_counts, sim_mat, family=sm.families.Poisson(link=sm.families.links.identity))

    glm_results = glm.fit_regularized(alpha=alpha, L1_wt=l1_ratio, refit=True)
    print(glm_results.summary())

    with open(out_fn, 'w') as out_f:
        for leaf, ab in zip(leaves, glm_results.params):
            print(leaf, ab, sep='\t', file=out_f)
github pzivich / zEpid / zepid / ipw / ipw.py View on Github external
to calculate
    
    Returns a pandas Series of calculated probabilities for all observations
    within the dataframe
    
    df:
        -Dataframe for the model
    model:
        -Model to fit the logistic regression to. Example) 'y ~ var1 + var2'
    mresult:
        -Whether to print the logistic regression results. Default is True
    
    Example)
    >>>zepid.ipw.propensity_score(df=data,model='X ~ Z + var1 + var2')
    '''
    f = sm.families.family.Binomial(sm.families.links.logit) 
    log = smf.glm(model,df,family=f).fit()
    if mresult == True:
        print('\n----------------------------------------------------------------')
        print('MODEL: '+model)
        print('-----------------------------------------------------------------')
        print(log.summary())
    p = log.predict(df)
    return p
github statsmodels / statsmodels / statsmodels / examples / es_misc_poisson2.py View on Github external
self = Dummy()

# generate artificial data
np.random.seed(98765678)
nobs = 200
rvs = np.random.randn(nobs,6)
data_exog = rvs
data_exog = sm.add_constant(data_exog)
xbeta = 1 + 0.1*rvs.sum(1)
data_endog = np.random.poisson(np.exp(xbeta))

#estimate discretemod.Poisson as benchmark
from statsmodels.discrete.discrete_model import Poisson
res_discrete = Poisson(data_endog, data_exog).fit()

mod_glm = sm.GLM(data_endog, data_exog, family=sm.families.Poisson())
res_glm = mod_glm.fit()

#estimate generic MLE
self.mod = PoissonGMLE(data_endog, data_exog)
res = self.mod.fit()
offset = res.params[0] * data_exog[:,0]  #1d ???

mod1 = PoissonOffsetGMLE(data_endog, data_exog[:,1:], offset=offset)
start_params = np.ones(6)/2.
start_params = res.params[1:]
res1 = mod1.fit(start_params=start_params, method='nm', maxiter=1000, maxfun=1000)

print 'mod2'
mod2 = PoissonZiGMLE(data_endog, data_exog[:,1:], offset=offset)
start_params = np.r_[np.ones(6)/2.,10]
start_params = np.r_[res.params[1:], 20.] #-100]