How to use the patsy.dmatrix function in patsy

To help you get started, we've selected a few patsy.dmatrix examples based on popular ways it is used in public projects.

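Every excerpt below boils down to the same call: patsy.dmatrix takes a formula (a string, or something formula-like such as a DesignInfo) plus a data source and returns a design matrix. Here is a minimal, self-contained sketch before the real-world excerpts; the column names and data are made up for illustration.

import pandas as pd
import patsy

# hypothetical example data
df = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0],
                   "group": ["a", "a", "b", "b"]})

# intercept + numeric column + dummy-coded categorical column
design = patsy.dmatrix("x + C(group)", data=df, return_type="dataframe")
print(design.columns.tolist())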

github ceholden / yatsm / tests / regression / test_regression_transforms.py
import numpy as np
import patsy

# `harm` is yatsm's harmonic transform helper, defined elsewhere in the project;
# patsy resolves it from the surrounding namespace when the formula is evaluated
def test_harmonic_transform():
    x = np.arange(735688, 735688 + 100, 1)
    design = patsy.dmatrix('0 + harm(x, 1)')

    truth = np.vstack((np.cos(2 * np.pi / 365.25 * x),
                       np.sin(2 * np.pi / 365.25 * x))).T

    np.testing.assert_equal(np.asarray(design), truth)
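The formula works because patsy evaluates the names inside the string (x as well as the harm function) against the calling environment. A minimal sketch of that mechanism, using a made-up doubled transform instead of yatsm's harm:

import numpy as np
import patsy

def doubled(values):
    # any callable visible in the calling scope can be used inside a formula
    return np.asarray(values) * 2.0

x = np.arange(5.0)

# '0 +' suppresses the intercept; doubled(x) is evaluated in this namespace
design = patsy.dmatrix("0 + doubled(x)")
print(np.asarray(design).ravel())  # [0. 2. 4. 6. 8.]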
github bambinos / bambi / bambi / models.py
if categorical is not None:
    data = data.copy()
    cats = listify(categorical)
    data[cats] = data[cats].apply(lambda x: x.astype("category"))

# Custom patsy.missing.NAAction class. Similar to patsy drop/raise
# defaults, but changes the raised message and logs any dropped rows
NA_handler = Custom_NA(dropna=self.dropna)

# screen fixed terms
if fixed is not None:
    if "~" in fixed:
        clean_fix = re.sub(r"\[.+\]", "", fixed)
        dmatrices(clean_fix, data=data, NA_action=NA_handler)
    else:
        dmatrix(fixed, data=data, NA_action=NA_handler)

# screen random terms
if random is not None:
    for term in listify(random):
        for side in term.split("|"):
            dmatrix(side, data=data, NA_action=NA_handler)

# update the running list of complete cases
if NA_handler.completes:
    self.completes.append(NA_handler.completes)

# save arguments to pass to _add()
args = dict(
    zip(
        ["fixed", "random", "priors", "family", "link", "categorical"],
        [fixed, random, priors, family, link, categorical],
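Bambi's Custom_NA class stands in for the NA_action argument that the snippet passes to dmatrix and dmatrices. A small sketch of the built-in behaviours it customises, using made-up data:

import numpy as np
import pandas as pd
import patsy

df = pd.DataFrame({"x": [1.0, np.nan, 3.0], "y": [2.0, 5.0, 8.0]})

# default behaviour: rows with missing values are silently dropped
dropped = patsy.dmatrix("x + y", data=df, NA_action="drop")
print(dropped.shape)  # (2, 3)

# 'raise' turns missing values into an explicit error instead
try:
    patsy.dmatrix("x + y", data=df, NA_action="raise")
except patsy.PatsyError as err:
    print(err)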
github pzivich / zEpid / zepid / causal / doublyrobust / TMLE.py
            # Step 2) Estimation under the scenarios
            dfx = self.df.copy()
            dfx[self.exposure] = 1
            self.QA1W = log.predict(dfx)
            dfx = self.df.copy()
            dfx[self.exposure] = 0
            self.QA0W = log.predict(dfx)

        # User-specified model
        else:
            # TODO need to create smart warning system
            # warnings.warn("TMLE can result in confidence intervals below nominal coverage when used with "
            #              "certain machine learning algorithms")
            self._out_model_custom = True
            data = patsy.dmatrix(model + ' - 1', cc)

            dfx = self.df.copy()
            dfx[self.exposure] = 1
            adata = patsy.dmatrix(model + ' - 1', dfx)
            dfx = self.df.copy()
            dfx[self.exposure] = 0
            ndata = patsy.dmatrix(model + ' - 1', dfx)

            self.QA1W, self.QA0W = outcome_machine_learner(xdata=np.asarray(data),
                                                           ydata=np.asarray(cc[self.outcome]),
                                                           all_a=adata, none_a=ndata,
                                                           ml_model=custom_model,
                                                           continuous=self._continuous_outcome,
                                                           print_results=print_results)

        if not bound:  # Bounding predicted probabilities if requested
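Appending ' - 1' to the formula, as this snippet does before calling dmatrix, removes the intercept column so the machine-learning model receives only the covariates themselves. A quick illustration with made-up data:

import pandas as pd
import patsy

df = pd.DataFrame({"age": [30, 40, 50], "art": [0, 1, 1]})

with_intercept = patsy.dmatrix("age + art", df)
no_intercept = patsy.dmatrix("age + art - 1", df)

print(with_intercept.design_info.column_names)  # ['Intercept', 'age', 'art']
print(no_intercept.design_info.column_names)    # ['age', 'art']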
github statsmodels / statsmodels / statsmodels / imputation / mice.py
def _process_kwds(self, kwds, ix):
    kwds = kwds.copy()
    for k in kwds:
        v = kwds[k]
        if isinstance(v, PatsyFormula):
            mat = patsy.dmatrix(v.formula, self.data,
                                return_type="dataframe")
            mat = np.asarray(mat)[ix, :]
            if mat.shape[1] == 1:
                mat = mat[:, 0]
            kwds[k] = mat
    return kwds
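Two small conveniences are worth calling out here: return_type="dataframe" gives back a labelled pandas DataFrame rather than patsy's own DesignMatrix, and np.asarray strips it down to a plain array for row indexing. Sketched with made-up data:

import numpy as np
import pandas as pd
import patsy

df = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0]})

mat = patsy.dmatrix("x", df, return_type="dataframe")  # pandas DataFrame
arr = np.asarray(mat)                                  # plain ndarray
subset = arr[[0, 2], :]                                # pick rows, as `ix` does above
print(subset.shape)  # (2, 2)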
github statsmodels / statsmodels / statsmodels / sandbox / multilinear.py
    # data normalization
    # if None, take all the numerical columns that are not present in the model;
    # it's not foolproof but is a good enough criterion for everyday use
    if column_list is None:
        column_list = [name for name in dataframe.columns
                      if dataframe[name].dtype != object and name not in model]
    # if it's a single string, transform it into a single-element list
    if isinstance(column_list, str):
        column_list = [column_list]
    if subset is not None:
        dataframe = dataframe.loc[subset]
    # perform each model and retrieve the statistics
    col_results = {}
    # as the model always uses the same exogenous variables,
    # we can create them once and reuse them
    model_exog = dmatrix(model, data=dataframe, return_type="dataframe")
    for col_name in column_list:
        # try to interpret the column name as a plain dataframe column,
        # as that can be several times faster. If that fails, it is
        # interpreted as a patsy formula (for example for centering)
        try:
            model_endog = dataframe[col_name]
        except KeyError:
            model_endog = dmatrix(col_name + ' + 0', data=dataframe)
        # retrieve the results and store them
        res = _model2dataframe(model_endog, model_exog, model_type, **kwargs)
        col_results[col_name] = res
    # merge them together and sort by the overall p-value
    summary = pd.DataFrame(col_results)
    # order by the p-value: the most useful model first!
    summary = summary.T.sort_values([('pvals', '_f_test')])
    summary.index.name = 'endogenous vars'
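The fallback branch above builds a one-column design by appending ' + 0' to the column name; like ' - 1', that suppresses the intercept, so the endogenous variable comes back untouched (possibly after a transform such as centering). A tiny sketch with a made-up column:

import pandas as pd
import patsy

df = pd.DataFrame({"score": [1.5, 2.0, 2.5]})

# 'score + 0' and 'score - 1' are equivalent: no intercept, just the column
endog = patsy.dmatrix("score + 0", data=df)
print(endog.design_info.column_names)  # ['score']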
github statsmodels / statsmodels / statsmodels / genmod / bayes_mixed_glm.py
            The data to which the formulas are applied.
        family : genmod.families instance
            A GLM family.
        vcp_p : float
            The prior standard deviation for the logarithms of the standard
            deviations of the random effects.
        fe_p : float
            The prior standard deviation for the fixed effects parameters.
        """

        ident = []
        exog_vc = []
        vcp_names = []
        j = 0
        for na, fml in vc_formulas.items():
            mat = patsy.dmatrix(fml, data, return_type='dataframe')
            exog_vc.append(mat)
            vcp_names.append(na)
            ident.append(j * np.ones(mat.shape[1], dtype=np.integer))
            j += 1
        exog_vc = pd.concat(exog_vc, axis=1)
        vc_names = exog_vc.columns.tolist()

        ident = np.concatenate(ident)

        model = super(_BayesMixedGLM, cls).from_formula(
            formula,
            data=data,
            family=family,
            subset=None,
            exog_vc=exog_vc,
            ident=ident,
github statsmodels / statsmodels / statsmodels / sandbox / predict_functional.py
    # The values of the 'focus variable' are a sequence of percentiles
    pctls = np.linspace(0, 100, num_points).tolist()
    fvals = np.percentile(exog[focus_var], pctls)
    fvals = np.asarray(fvals)
    fexog.loc[:, focus_var] = fvals

    # The values of the other variables may be given by summary functions...
    for ky in summaries.keys():
        fexog.loc[:, ky] = summaries[ky](exog.loc[:, ky])

    # or they may be provided as given values.
    for ky in values.keys():
        fexog.loc[:, ky] = values[ky]

    dexog = patsy.dmatrix(model.data.design_info, fexog,
                          return_type='dataframe')
    return dexog, fexog, fvals
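Note that the first argument to dmatrix here is not a formula string but the design_info of the fitted model: passing a DesignInfo makes patsy code the new data exactly like the original design (same columns, same contrasts). A sketch of the idea with made-up data:

import pandas as pd
import patsy

train = pd.DataFrame({"x": [1.0, 2.0, 3.0], "g": ["a", "b", "a"]})
new = pd.DataFrame({"x": [4.0, 5.0], "g": ["a", "b"]})

design = patsy.dmatrix("x + C(g)", train, return_type="dataframe")

# Reusing design.design_info guarantees `new` is coded with the same
# columns and contrasts as the original design matrix
new_design = patsy.dmatrix(design.design_info, new, return_type="dataframe")
print(new_design.columns.tolist())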
github statsmodels / statsmodels / statsmodels / sandbox / predict_functional.py
        raise ValueError('confidence band method must be one of `pointwise`, `scheffe`, and `simultaneous`.')

    contrast = (values2 is not None) or (summaries2 is not None)

    if contrast and not linear:
        raise ValueError("`linear` must be True for computing contrasts")

    model = result.model
    if exog is not None:

        if any(x is not None for x in [summaries, summaries2, values, values2]):
            raise ValueError("if `exog` is provided then do not "
                             "provide `summaries` or `values`")

        fexog = exog
        dexog = patsy.dmatrix(model.data.design_info,
                              fexog, return_type='dataframe')
        fvals = exog[focus_var]

        if exog2 is not None:
            fexog2 = exog
            dexog2 = patsy.dmatrix(model.data.design_info,
                                   fexog2, return_type='dataframe')
            fvals2 = fvals

    else:

        values, summaries, values2, summaries2 = _check_args(values,
                                                             summaries,
                                                             values2,
                                                             summaries2)
github pzivich / zEpid / zepid / causal / utils.py
        elif var_type == 'categorical':
            if weighted:
                wt = np.average(dft[vcols], weights=dft[weight], axis=0)
                wn = np.average(dfn[vcols], weights=dfn[weight], axis=0)
            else:
                wt = np.mean(dft[vcols], axis=0)
                wn = np.mean(dfn[vcols], axis=0)

            t_c = wt - wn
            s_inv = np.linalg.inv(_categorical_cov_(a=wt, b=wn))
            return float(np.sqrt(np.dot(np.transpose(t_c[1:]), np.dot(s_inv, t_c[1:]))))

        else:
            raise ValueError('Not supported')

    variables = patsy.dmatrix(formula + ' - 1', df, return_type='dataframe')
    w_diff = []
    u_diff = []
    vlabel = []

    # Pull out list of terms and the corresponding dataframe slice(s)
    term_dict = variables.design_info.term_name_slices

    # Looping through the terms
    for term in variables.design_info.terms:
        # Adding term labels
        vlabel.append(term.name())

        # Pulling out data corresponding to term
        chunk = term_dict[term.name()]
        v = variables.iloc[:, chunk].copy()
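design_info is also where patsy records how formula terms map to design-matrix columns: design_info.terms lists the terms and design_info.term_name_slices gives the column slice each term produced, which is exactly what the loop above uses to pull out one variable at a time. A small sketch with made-up data:

import pandas as pd
import patsy

df = pd.DataFrame({"x": [1.0, 2.0, 3.0], "g": ["a", "b", "a"]})
variables = patsy.dmatrix("x + C(g) - 1", df, return_type="dataframe")

# map each formula term to the slice of design-matrix columns it produced
for term in variables.design_info.terms:
    cols = variables.design_info.term_name_slices[term.name()]
    print(term.name(), variables.columns[cols].tolist())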