import numpy as np
import patsy


def test_harmonic_transform():
    x = np.arange(735688, 735688 + 100, 1)
    design = patsy.dmatrix('0 + harm(x, 1)')
    truth = np.vstack((np.cos(2 * np.pi / 365.25 * x),
                       np.sin(2 * np.pi / 365.25 * x))).T
    np.testing.assert_equal(np.asarray(design), truth)
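The test assumes a `harm` transform is visible in the namespace where patsy evaluates the formula. A minimal sketch consistent with the expected output; the function name is from the test, but the annual period and the cos/sin column order are read off the `truth` array rather than taken from a known implementation:

import numpy as np

def harm(x, nharm, period=365.25):
    # nharm harmonic pairs: cos/sin columns at multiples of the base frequency
    x = np.asarray(x)
    cols = []
    for k in range(1, nharm + 1):
        w = 2 * np.pi * k / period
        cols.append(np.cos(w * x))
        cols.append(np.sin(w * x))
    return np.column_stack(cols)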
if categorical is not None:
    data = data.copy()
    cats = listify(categorical)
    data[cats] = data[cats].apply(lambda x: x.astype("category"))

# Custom patsy.missing.NAAction class. Similar to patsy's drop/raise
# defaults, but changes the raised message and logs any dropped rows
# (a minimal sketch of this class follows the snippet).
NA_handler = Custom_NA(dropna=self.dropna)

# screen fixed terms
if fixed is not None:
    if "~" in fixed:
        clean_fix = re.sub(r"\[.+\]", "", fixed)
        dmatrices(clean_fix, data=data, NA_action=NA_handler)
    else:
        dmatrix(fixed, data=data, NA_action=NA_handler)

# screen random terms
if random is not None:
    for term in listify(random):
        for side in term.split("|"):
            dmatrix(side, data=data, NA_action=NA_handler)

# update the running list of complete cases
if NA_handler.completes:
    self.completes.append(NA_handler.completes)

# save the arguments to pass to _add()
args = dict(
    zip(
        ["fixed", "random", "priors", "family", "link", "categorical"],
        [fixed, random, priors, family, link, categorical],
    )
)
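For reference, a minimal sketch of the Custom_NA handler used above. It is assumed to behave like patsy's NAAction(on_NA='drop') while recording which rows survive; `_handle_NA_drop` is a private patsy hook, so the real class may differ in detail:

import numpy as np
from patsy.missing import NAAction

class Custom_NA(NAAction):
    def __init__(self, dropna=True):
        # drop incomplete rows when dropna=True, otherwise raise as usual
        super().__init__(on_NA="drop" if dropna else "raise")
        self.completes = []

    def _handle_NA_drop(self, values, is_NAs, origins):
        # combine the per-column NA masks into a single row mask
        total_mask = np.zeros(is_NAs[0].shape[0], dtype=bool)
        for is_NA in is_NAs:
            total_mask |= is_NA
        good_mask = ~total_mask
        # record the indices of the complete cases
        self.completes.append(np.flatnonzero(good_mask))
        return [v[good_mask] for v in values]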
    # Step 2) Estimation under the scenarios
    dfx = self.df.copy()
    dfx[self.exposure] = 1
    self.QA1W = log.predict(dfx)
    dfx = self.df.copy()
    dfx[self.exposure] = 0
    self.QA0W = log.predict(dfx)

# User-specified model
else:
    # TODO need to create smart warning system
    # warnings.warn("TMLE can result in confidence intervals below nominal coverage when used with "
    #               "certain machine learning algorithms")
    self._out_model_custom = True
    data = patsy.dmatrix(model + ' - 1', cc)

    dfx = self.df.copy()
    dfx[self.exposure] = 1
    adata = patsy.dmatrix(model + ' - 1', dfx)

    dfx = self.df.copy()
    dfx[self.exposure] = 0
    ndata = patsy.dmatrix(model + ' - 1', dfx)

    self.QA1W, self.QA0W = outcome_machine_learner(xdata=np.asarray(data),
                                                   ydata=np.asarray(cc[self.outcome]),
                                                   all_a=adata, none_a=ndata,
                                                   ml_model=custom_model,
                                                   continuous=self._continuous_outcome,
                                                   print_results=print_results)

if not bound:  # Bounding predicted probabilities if requested
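A hypothetical helper illustrating the bounding step referenced in the final line above: truncating predicted probabilities away from 0 and 1 keeps the later targeting step of TMLE numerically stable. The name and the scalar-versus-pair convention are assumptions, not zEpid's actual implementation:

import numpy as np

def _bound_probabilities(probs, bounds):
    # a scalar is read as a symmetric bound, a pair as explicit limits
    if np.isscalar(bounds):
        lo, hi = bounds, 1 - bounds
    else:
        lo, hi = bounds
    return np.clip(np.asarray(probs, dtype=float), lo, hi)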
def _process_kwds(self, kwds, ix):
    kwds = kwds.copy()
    for k in kwds:
        v = kwds[k]
        if isinstance(v, PatsyFormula):
            mat = patsy.dmatrix(v.formula, self.data,
                                return_type="dataframe")
            mat = np.asarray(mat)[ix, :]
            if mat.shape[1] == 1:
                mat = mat[:, 0]
            kwds[k] = mat
    return kwds
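Illustration of the mechanism: a keyword wrapped in PatsyFormula is expanded through patsy into a design matrix, then squeezed to 1-D when it has a single column. The stand-in wrapper below only loosely mirrors statsmodels' PatsyFormula, and the data are toy:

import numpy as np
import pandas as pd
import patsy

class PatsyFormula:
    # minimal stand-in: tags a string as "expand me through patsy"
    def __init__(self, formula):
        self.formula = "0 + " + formula

data = pd.DataFrame({"time": [1.0, 2.0, 4.0], "x": [0.0, 1.0, 0.0]})
v = PatsyFormula("np.log(time)")
mat = np.asarray(patsy.dmatrix(v.formula, data))  # shape (3, 1)
offset = mat[:, 0]                                # squeezed, as in the loop above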
# data normalization
# if None, take all the numerical columns that are not present in the model
# it's not watertight, but it is a good enough criterion for everyday use
if column_list is None:
    column_list = [name for name in dataframe.columns
                   if dataframe[name].dtype != object and name not in model]

# if it's a single string, transform it into a single-element list
if isinstance(column_list, str):
    column_list = [column_list]

if subset is not None:
    dataframe = dataframe.loc[subset]

# fit each model and retrieve the statistics
col_results = {}
# as the models will always use the same exogenous variables
# we can create them once and reuse them
model_exog = dmatrix(model, data=dataframe, return_type="dataframe")
for col_name in column_list:
    # first try to interpret the column name as a valid dataframe
    # index, as that can be several times faster. If that fails,
    # interpret it as a patsy formula (for example for centering)
    try:
        model_endog = dataframe[col_name]
    except KeyError:
        model_endog = dmatrix(col_name + ' + 0', data=dataframe)

    # retrieve the results and store them
    res = _model2dataframe(model_endog, model_exog, model_type, **kwargs)
    col_results[col_name] = res

# merge the results together and sort by the overall F-test p-value
summary = pd.DataFrame(col_results)
# order by the p-value: the most significant model first!
summary = summary.T.sort_values([('pvals', '_f_test')])
summary.index.name = 'endogenous vars'
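Example usage, assuming this body belongs to statsmodels.sandbox.multilinear.multiOLS: regress every numeric column of the Longley dataset on GNP (plus an intercept), ranked by the F-test p-value:

import statsmodels.api as sm
from statsmodels.sandbox.multilinear import multiOLS

data = sm.datasets.longley.load_pandas().data
summary = multiOLS('GNP + 1', data)
print(summary.head())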
data : dataframe
    The data to which the formulas are applied.
family : genmod.families instance
    A GLM family.
vcp_p : float
    The prior standard deviation for the logarithms of the standard
    deviations of the random effects.
fe_p : float
    The prior standard deviation for the fixed effects parameters.
"""
ident = []
exog_vc = []
vcp_names = []
j = 0
for na, fml in vc_formulas.items():
    mat = patsy.dmatrix(fml, data, return_type='dataframe')
    exog_vc.append(mat)
    vcp_names.append(na)
    ident.append(j * np.ones(mat.shape[1], dtype=int))
    j += 1

exog_vc = pd.concat(exog_vc, axis=1)
vc_names = exog_vc.columns.tolist()
ident = np.concatenate(ident)

model = super(_BayesMixedGLM, cls).from_formula(
    formula,
    data=data,
    family=family,
    subset=None,
    exog_vc=exog_vc,
    ident=ident,
    vc_names=vc_names,
    vcp_names=vcp_names,
    fe_p=fe_p,
    vcp_p=vcp_p)
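A short end-to-end example of the public entry point built on this classmethod (real statsmodels API; the data and the single variance component are toy):

import numpy as np
import pandas as pd
from statsmodels.genmod.bayes_mixed_glm import BinomialBayesMixedGLM

rng = np.random.default_rng(0)
df = pd.DataFrame({
    'y': rng.integers(0, 2, 200),
    'x': rng.normal(size=200),
    'grp': rng.integers(0, 10, 200).astype(str),
})
# one variance component: a random intercept for each level of grp
model = BinomialBayesMixedGLM.from_formula('y ~ x', {'grp': '0 + C(grp)'}, df)
result = model.fit_vb()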
# The values of the 'focus variable' are a sequence of percentiles
pctls = np.linspace(0, 100, num_points).tolist()
fvals = np.percentile(exog[focus_var], pctls)
fvals = np.asarray(fvals)
fexog.loc[:, focus_var] = fvals

# The values of the other variables may be given by summary functions...
for ky in summaries.keys():
    fexog.loc[:, ky] = summaries[ky](exog.loc[:, ky])

# ... or they may be provided as given values.
for ky in values.keys():
    fexog.loc[:, ky] = values[ky]

dexog = patsy.dmatrix(model.data.design_info, fexog,
                      return_type='dataframe')
return dexog, fexog, fvals
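Illustrative `summaries`/`values` arguments for the step above (the column names are hypothetical): hold one covariate at its sample mean and pin another at a constant while the focus variable sweeps its percentile grid:

import numpy as np

summaries = {'bmi': np.mean}  # replace bmi by a summary of its column
values = {'smoker': 0}        # pin smoker at a fixed value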
raise ValueError("confidence band method must be one of "
                 "`pointwise`, `scheffe`, or `simultaneous`.")
contrast = (values2 is not None) or (summaries2 is not None)
if contrast and not linear:
    raise ValueError("`linear` must be True for computing contrasts")

model = result.model

if exog is not None:
    if any(x is not None for x in [summaries, summaries2, values, values2]):
        raise ValueError("if `exog` is provided then do not "
                         "provide `summaries` or `values`")

    fexog = exog
    dexog = patsy.dmatrix(model.data.design_info,
                          fexog, return_type='dataframe')
    fvals = exog[focus_var]

    if exog2 is not None:
        fexog2 = exog2
        dexog2 = patsy.dmatrix(model.data.design_info,
                               fexog2, return_type='dataframe')
        fvals2 = fvals
else:
    values, summaries, values2, summaries2 = _check_args(values,
                                                         summaries,
                                                         values2,
                                                         summaries2)
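A compact usage example of the surrounding predict_functional (statsmodels.sandbox.predict_functional; the data and column names are toy):

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.sandbox.predict_functional import predict_functional

rng = np.random.default_rng(0)
df = pd.DataFrame({'age': rng.uniform(20, 70, 200),
                   'smoker': rng.integers(0, 2, 200)})
df['y'] = 0.05 * df['age'] + 0.3 * df['smoker'] + rng.normal(size=200)
result = sm.OLS.from_formula('y ~ age + smoker', df).fit()

# fitted mean and pointwise confidence band as age sweeps its
# percentile grid, with smoker held at 0:
pred, cb, fvals = predict_functional(result, 'age', values={'smoker': 0})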
elif var_type == 'categorical':
    if weighted:
        wt = np.average(dft[vcols], weights=dft[weight], axis=0)
        wn = np.average(dfn[vcols], weights=dfn[weight], axis=0)
    else:
        wt = np.mean(dft[vcols], axis=0)
        wn = np.mean(dfn[vcols], axis=0)

    t_c = wt - wn
    s_inv = np.linalg.inv(_categorical_cov_(a=wt, b=wn))
    return float(np.sqrt(np.dot(np.transpose(t_c[1:]), np.dot(s_inv, t_c[1:]))))

else:
    raise ValueError('Not supported')
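A toy check of the categorical branch. The helper _categorical_cov_ is assumed to build a multinomial covariance of the level proportions; averaging the two groups' covariances, as below, is one common convention (following Yang and Dalton) and may not match the library's exact helper:

import numpy as np

wt = np.array([0.5, 0.3, 0.2])  # treated-group level proportions
wn = np.array([0.4, 0.4, 0.2])  # untreated-group level proportions
d = (wt - wn)[1:]               # drop the reference level
S = ((np.diag(wt[1:]) - np.outer(wt[1:], wt[1:]))
     + (np.diag(wn[1:]) - np.outer(wn[1:], wn[1:]))) / 2
smd = float(np.sqrt(d @ np.linalg.solve(S, d)))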
variables = patsy.dmatrix(formula + ' - 1', df, return_type='dataframe')
w_diff = []
u_diff = []
vlabel = []

# Pull out the list of terms and the corresponding dataframe slice(s)
term_dict = variables.design_info.term_name_slices

# Loop through the terms
for term in variables.design_info.terms:
    # Add the term label
    vlabel.append(term.name())

    # Pull out the data corresponding to the term
    chunk = term_dict[term.name()]
    v = variables.iloc[:, chunk].copy()
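For context, a self-contained demonstration of patsy's term_name_slices, which maps each term to the design-matrix columns it generated (real patsy API; the data are toy):

import pandas as pd
import patsy

df = pd.DataFrame({'g': ['a', 'b', 'a', 'c'], 'x': [1.0, 2.0, 3.0, 4.0]})
m = patsy.dmatrix('C(g) + x - 1', df, return_type='dataframe')
for term in m.design_info.terms:
    sl = m.design_info.term_name_slices[term.name()]
    print(term.name(), list(m.columns[sl]))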