Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _test_anndata_raw(self, sparse):
    """
    Run the differential-expression checks on AnnData input with ``.raw`` set.

    Simulated data is wrapped in an ``anndata.AnnData`` object whose ``.raw``
    attribute is assigned from the object itself. The Wald and LRT checks
    receive ``adata.raw``; the t-test and rank checks receive the AnnData
    object directly.

    :param sparse: If truthy, the simulated matrix is converted to a
        ``scipy.sparse.csr_matrix`` before being wrapped in AnnData.
    """
    counts, sample_description = self.simulate()

    # One synthetic name per column of the simulated matrix.
    names = [f"gene{idx}" for idx in range(counts.shape[1])]

    if sparse:
        counts = scipy.sparse.csr_matrix(counts)

    adata = anndata.AnnData(counts)
    adata.var_names = names
    adata.raw = adata

    # Checks that are fed the Raw view.
    self._test_wald(data=adata.raw, sample_description=sample_description)
    self._test_lrt(data=adata.raw, sample_description=sample_description)

    # Checks that are fed the AnnData object itself.
    self._test_t_test(data=adata, sample_description=sample_description)
    self._test_rank(data=adata, sample_description=sample_description)
def test_reg_mean_plot():
    """
    Smoke-test ``scgen.plotting.reg_mean_plot`` with four argument combinations.

    Loads the training data, builds a ``scgen.VAEArith`` network (trained for
    zero epochs), predicts the stimulated state of control CD4T cells, and
    calls ``reg_mean_plot`` with and without a gene list and with and without
    the additional ``"y1"`` axis key. Each call writes a PDF under ``tests/``.

    The network session is closed in a ``finally`` clause so that a failure
    in prediction or plotting does not leave it open.
    """
    train = sc.read("./tests/data/train.h5ad", backup_url="https://goo.gl/33HtVh")
    network = scgen.VAEArith(x_dimension=train.shape[1], model_path="../models/test")
    network.train(train_data=train, n_epochs=0)
    try:
        is_cd4t = train.obs["cell_type"] == "CD4T"
        unperturbed_data = train[(is_cd4t & (train.obs["condition"] == "control"))]
        condition = {"ctrl": "control", "stim": "stimulated"}
        pred, delta = network.predict(adata=train, adata_to_predict=unperturbed_data, conditions=condition,
                                      condition_key="condition", cell_type_key="cell_type")
        pred_adata = anndata.AnnData(pred, obs={"condition": ["pred"] * len(pred)},
                                     var={"var_names": train.var_names})
        CD4T = train[train.obs["cell_type"] == "CD4T"]
        all_adata = CD4T.concatenate(pred_adata)

        # Two axes, all genes.
        scgen.plotting.reg_mean_plot(all_adata, condition_key="condition",
                                     axis_keys={"x": "control", "y": "pred"},
                                     path_to_save="tests/reg_mean1.pdf")
        # Two axes, highlighted gene list.
        scgen.plotting.reg_mean_plot(all_adata, condition_key="condition",
                                     axis_keys={"x": "control", "y": "pred"},
                                     path_to_save="tests/reg_mean2.pdf", gene_list=["ISG15", "CD3D"])
        # Three axis keys, all genes.
        scgen.plotting.reg_mean_plot(all_adata, condition_key="condition",
                                     axis_keys={"x": "control", "y": "pred", "y1": "stimulated"},
                                     path_to_save="tests/reg_mean3.pdf")
        # Three axis keys, highlighted gene list.
        scgen.plotting.reg_mean_plot(all_adata, condition_key="condition",
                                     axis_keys={"x": "control", "y": "pred", "y1": "stimulated"},
                                     gene_list=["ISG15", "CD3D"], path_to_save="tests/reg_mean.pdf")
    finally:
        # Always release the session, even when a step above raises.
        network.sess.close()
# Fragment: the enclosing function header is not visible here.
# When there are no rows for the source condition, fall back to the rows of
# `pred_adatas` whose `condition_key` column equals `source_condition`.
if adata_source.shape[0] == 0:
adata_source = pred_adatas.copy()[pred_adatas.obs[condition_key] == source_condition]
# Same fallback for the target condition.
if adata_target.shape[0] == 0:
adata_target = pred_adatas.copy()[pred_adatas.obs[condition_key] == target_condition]
# Constant label vectors with one entry per row of `adata_source`. Both are
# sized by `adata_source` because both are passed to predict() together with it.
source_labels = np.zeros(adata_source.shape[0]) + source_label
target_labels = np.zeros(adata_source.shape[0]) + target_label
# Predict from the source cells, encoding with the source labels and decoding
# with the target labels.
pred_target = network.predict(adata_source,
encoder_labels=source_labels,
decoder_labels=target_labels,
size_factor=adata_source.obs['size_factors'].values
)
# Wrap the prediction in AnnData, tag every row with `name` under
# `condition_key`, and reuse the variable names of `adata`.
pred_adata = anndata.AnnData(X=pred_target)
pred_adata.obs[condition_key] = [name] * pred_target.shape[0]
pred_adata.var_names = adata.var_names
# Replace any sparse `.X` with its `.A` form.
# NOTE(review): `.A` is presumably the scipy.sparse dense-array shorthand; newer
# SciPy releases favour `.toarray()` — confirm against the pinned SciPy version.
if sparse.issparse(adata_source.X):
adata_source.X = adata_source.X.A
if sparse.issparse(adata_target.X):
adata_target.X = adata_target.X.A
if sparse.issparse(pred_adata.X):
pred_adata.X = pred_adata.X.A
# adata_to_plot = pred_adata.concatenate(adata_target)
# trvae.plotting.reg_mean_plot(adata_to_plot,
# top_100_genes=top_100_genes,
show=False)
# Fragment: the enclosing function header is not visible here.
# Decode `latent_with_true_labels` using the same labels for encoder and
# decoder; `true_labels` at this point comes from code above this excerpt.
decoded_latent_with_true_labels = network.predict(data=latent_with_true_labels, encoder_labels=true_labels,
decoder_labels=true_labels, data_space='latent')
# All cells of the chosen cell type, and the subset of them in the control condition.
cell_type_data = train[train.obs[cell_type_key] == cell_type]
unperturbed_data = train[((train.obs[cell_type_key] == cell_type) & (train.obs[condition_key] == ctrl_key))]
# Column vectors of zeros / ones, one row per unperturbed cell.
true_labels = np.zeros((len(unperturbed_data), 1))
fake_labels = np.ones((len(unperturbed_data), 1))
# Rank genes between the groups of `condition_key` and keep the names listed
# for the `stim_key` group.
sc.tl.rank_genes_groups(cell_type_data, groupby=condition_key, n_genes=100)
diff_genes = cell_type_data.uns["rank_genes_groups"]["names"][stim_key]
# cell_type_data = cell_type_data.copy()[:, diff_genes.tolist()]
# Predict from the unperturbed cells, encoding with the zero labels and
# decoding with the one labels.
pred = network.predict(data=unperturbed_data, encoder_labels=true_labels, decoder_labels=fake_labels)
# Wrap the prediction as AnnData labelled "pred" and append it to the real cells.
pred_adata = anndata.AnnData(pred, obs={condition_key: ["pred"] * len(pred)},
var={"var_names": cell_type_data.var_names})
all_adata = cell_type_data.concatenate(pred_adata)
# Mean and variance regression plots; output file names embed `z_dim`.
scgen.plotting.reg_mean_plot(all_adata, condition_key=condition_key,
axis_keys={"x": ctrl_key, "y": stim_key, "y1": "pred"},
gene_list=diff_genes,
path_to_save=f"./figures/reg_mean_{z_dim}.pdf")
scgen.plotting.reg_var_plot(all_adata, condition_key=condition_key,
axis_keys={"x": ctrl_key, "y": stim_key, 'y1': "pred"},
gene_list=diff_genes,
path_to_save=f"./figures/reg_var_{z_dim}.pdf")
# Neighbour graph, UMAP embedding, and a UMAP plot coloured by condition.
sc.pp.neighbors(all_adata)
sc.tl.umap(all_adata)
sc.pl.umap(all_adata, color=condition_key,
save="pred")
def test_anndata(self):
    """
    Round-trip the dataset through an h5ad file and compare the result.

    ``self.ds`` is converted to AnnData, written to ``./test.h5ad``, read
    back, converted to an ``ExprDataSet`` again, and compared with the
    original via ``self._compare_datasets``.
    """
    h5ad_path = "./test.h5ad"
    self.ds.to_anndata().write_h5ad(h5ad_path)
    restored = cb.data.ExprDataSet.from_anndata(anndata.read_h5ad(h5ad_path))
    self._compare_datasets(self.ds, restored)
:param data: Input data matrix (observations x features) or (cells x genes).
:param sample_description: pandas.DataFrame containing sample annotations, can be None.
:return: Assembled sample annotations.
"""
# Fragment: function body only; the signature and the start of the docstring
# are above this excerpt.
# No explicit annotations given: take them from `data.obs` when `data` is an
# AnnData object, otherwise there is nothing to fall back on.
if sample_description is None:
if anndata is not None and isinstance(data, anndata.AnnData):
sample_description = data.obs
else:
raise ValueError(
"Please specify `sample_description` or provide `data` as anndata.AnnData " +
"with corresponding sample annotations"
)
# Check that the number of rows of the data matrix equals the number of rows
# of the annotations. The matrix is reached through a different attribute
# depending on the container type (`.X`, `.x`, or the object itself).
if sample_description is not None:
if anndata is not None and isinstance(data, Raw):
# Raw does not have attribute shape.
assert data.X.shape[0] == sample_description.shape[0], \
"data matrix and sample description must contain same number of cells: %i, %i" % \
(data.X.shape[0], sample_description.shape[0])
elif isinstance(data, glm.typing.InputDataBase):
assert data.x.shape[0] == sample_description.shape[0], \
"data matrix and sample description must contain same number of cells: %i, %i" % \
(data.x.shape[0], sample_description.shape[0])
else:
assert data.shape[0] == sample_description.shape[0], \
"data matrix and sample description must contain same number of cells: %i, %i" % \
(data.shape[0], sample_description.shape[0])
return sample_description
gene_names: Union[np.ndarray, list] = None,
sample_description: pd.DataFrame = None
):
"""
:param data: Array-like or anndata.Anndata object containing observations.
Input data matrix (observations x features) or (cells x genes).
:param parts: str, array
- column in data.obs/sample_description which contains the split of observations into the two groups.
- array of length `num_observations` containing group labels
:param gene_names: optional list/array of gene names which will be used if `data` does not implicitly store these
:param sample_description: optional pandas.DataFrame containing sample annotations
"""
# Fragment: constructor body only; the start of the signature is above this excerpt.
# Extract the underlying matrix: `.x` for InputDataBase, `.X` for AnnData/Raw,
# or the array itself for a numpy ndarray. Any other type is rejected.
if isinstance(data, glm.typing.InputDataBase):
self.x = data.x
elif isinstance(data, anndata.AnnData) or isinstance(data, Raw):
self.x = data.X
elif isinstance(data, np.ndarray):
self.x = data
else:
raise ValueError("data type %s not recognized" % type(data))
# Resolve gene names, sample annotations and the grouping via the parse_* helpers.
self.gene_names = parse_gene_names(data, gene_names)
self.sample_description = parse_sample_description(data, sample_description)
self.partition = parse_grouping(data, sample_description, parts)
# Distinct group labels, and for each label the indices of the observations
# that carry it.
self.partitions = np.unique(self.partition)
self.partition_idx = [np.where(self.partition == x)[0] for x in self.partitions]
# Fragment: the enclosing function and the outer `if` that pairs with the
# second `else` below are not visible in this excerpt.
# Both Z-test variants receive the same arguments; `lazy` only selects the class.
if lazy:
de_test = DifferentialExpressionTestZTestLazy(
model_estim=model,
grouping=grouping,
groups=np.unique(grouping),
correction_type=pval_correction
)
else:
de_test = DifferentialExpressionTestZTest(
model_estim=model,
grouping=grouping,
groups=np.unique(grouping),
correction_type=pval_correction
)
else:
# Unwrap the matrix from AnnData/Raw (`.X`) or InputDataBase (`.x`).
if isinstance(data, anndata.AnnData) or isinstance(data, anndata.Raw):
data = data.X
elif isinstance(data, glm.typing.InputDataBase):
data = data.x
groups = np.unique(grouping)
# Result arrays indexed (group, group, column of data), filled with NaN and
# with the group-vs-itself diagonal set to 0.
# NOTE(review): the `np.NaN` alias was removed in NumPy 2.0 (`np.nan` is the
# supported spelling) — confirm the NumPy version this runs against.
pvals = np.tile(np.NaN, [len(groups), len(groups), data.shape[1]])
pvals[np.eye(pvals.shape[0]).astype(bool)] = 0
logfc = np.tile(np.NaN, [len(groups), len(groups), data.shape[1]])
logfc[np.eye(logfc.shape[0]).astype(bool)] = 0
# Optional (group x group) grid, initialised to None, for the test objects.
if keep_full_test_objs:
tests = np.tile([None], [len(groups), len(groups)])
else:
tests = None
# The inner loop starts at i + 1, so `j` is an offset into that slice rather
# than an index into `groups`. The loop body is cut off in this excerpt.
for i, g1 in enumerate(groups):
for j, g2 in enumerate(groups[(i + 1):]):
# Fragment: constructor body only; the signature is not visible in this excerpt.
self._check_data(data)
n_pca, rank_threshold = self._parse_n_pca_threshold(data, n_pca, rank_threshold)
# Convert pandas input: sparse frames to COO, other DataFrames to a numpy array
# when the `.sparse` accessor is unavailable.
# NOTE(review): the outer handler only catches NameError (pandas not imported).
# `pd.SparseDataFrame` was removed in pandas 1.0, so on newer pandas the first
# isinstance check would presumably raise AttributeError instead — confirm.
try:
if isinstance(data, pd.SparseDataFrame):
data = data.to_coo()
elif isinstance(data, pd.DataFrame):
try:
data = data.sparse.to_coo()
except AttributeError:
data = np.array(data)
except NameError:
# pandas not installed
pass
# Unwrap AnnData to its `.X` matrix when anndata is available.
try:
if isinstance(data, anndata.AnnData):
data = data.X
except NameError:
# anndata not installed
pass
# Store the inputs, then compute `data_nu` via `_reduce_data()` before
# handing the remaining keyword arguments to the parent constructor.
self.data = data
self.n_pca = n_pca
self.rank_threshold = rank_threshold
self.random_state = random_state
self.data_nu = self._reduce_data()
super().__init__(**kwargs)
If `return_info` is true, all estimated distribution parameters are stored in AnnData such as:
- `.obsm["X_dca_dropout"]` which is the mixture coefficient (pi) of the zero component
in ZINB, i.e. dropout probability. (Only if ae_type is zinb or zinb-conddisp)
- `.obsm["X_dca_dispersion"]` which is the dispersion parameter of NB.
- `.uns["dca_loss_history"]` which stores the loss history of the training.
Finally, the raw counts are stored as `.raw`.
If `return_model` is given, trained model is returned. When both `copy` and `return_model`
are true, a tuple of anndata and model is returned in that order.
"""
# Fragment: function body only; the signature is above this excerpt and the
# remainder of the body is cut off below.
# Validate the input type and the requested mode.
assert isinstance(adata, anndata.AnnData), 'adata must be an AnnData instance'
assert mode in ('denoise', 'latent'), '%s is not a valid mode.' % mode
# set seed for reproducibility
random.seed(random_state)
np.random.seed(random_state)
# NOTE(review): `tf.set_random_seed` is the TensorFlow 1.x name — confirm the
# TensorFlow version in use.
tf.set_random_seed(random_state)
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
# here presumably only affects child processes — confirm the intent.
os.environ['PYTHONHASHSEED'] = '0'
# this creates adata.raw with raw counts and copies adata if copy==True
adata = read_dataset(adata,
transpose=False,
test_split=False,
copy=copy)
# check for zero genes
# The call is unpacked into two values; only the first is kept.
nonzero_genes, _ = sc.pp.filter_genes(adata.X, min_counts=1)