use_raw
    Use `raw` attribute of `adata` if present.

    .. versionchanged:: 1.4.5
       Default value changed from `False` to `None`.
Returns
-------
Depending on `copy`, returns or updates `adata` with an additional field
`score_name`.
Examples
--------
See this `notebook `__.
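
A minimal call might look like this (a sketch only: the dataset and gene
names are hypothetical placeholders, not taken from this codebase)::

    import scanpy as sc

    adata = sc.datasets.pbmc68k_reduced()
    # hypothetical two-gene signature, scored per cell
    sc.tl.score_genes(adata, ['HES4', 'TNFRSF4'], score_name='my_score')
    adata.obs['my_score']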
"""
start = logg.info(f'computing score {score_name!r}')
adata = adata.copy() if copy else adata
if random_state is not None:
    np.random.seed(random_state)
gene_list_in_var = []
var_names = adata.raw.var_names if use_raw else adata.var_names
genes_to_ignore = []
for gene in gene_list:
    if gene in var_names:
        gene_list_in_var.append(gene)
    else:
        genes_to_ignore.append(gene)
if len(genes_to_ignore) > 0:
    logg.warning(f'genes are not in var_names and ignored: {genes_to_ignore}')
gene_list = set(gene_list_in_var)
    evals, evecs = scipy.linalg.eigh(matrix)
else:
    n_comps = min(matrix.shape[0] - 1, n_comps)
    # ncv = max(2 * n_comps + 1, int(np.sqrt(matrix.shape[0])))
    ncv = None
    which = 'LM' if sort == 'decrease' else 'SM'
    # a bit more precision (float64) pays off in numerical stability here
    matrix = matrix.astype(np.float64)
    evals, evecs = scipy.sparse.linalg.eigsh(
        matrix, k=n_comps, which=which, ncv=ncv
    )
    evals, evecs = evals.astype(np.float32), evecs.astype(np.float32)
if sort == 'decrease':
    evals = evals[::-1]
    evecs = evecs[:, ::-1]
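# For intuition, a sketch (not part of the original code) of why the two
# reversals above belong together, using only numpy/scipy as imported above:
#     m = np.array([[2.0, 1.0], [1.0, 2.0]])
#     evals, evecs = scipy.linalg.eigh(m)  # evals == [1.0, 3.0], ascending
#     evals[::-1], evecs[:, ::-1]          # reverse both, so column i of
#                                          # evecs still matches evals[i]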
logg.info(
    ' eigenvalues of transition matrix\n'
    ' {}'.format(str(evals).replace('\n', '\n '))
)
if self._number_connected_components > len(evals)/2:
    logg.warning('Transition matrix has many disconnected components!')
self._eigen_values = evals
self._eigen_basis = evecs
    '(https://github.com/DmitryUlyanov/Multicore-TSNE). '
    'Even for n_jobs=1 this speeds up the computation considerably '
    'and might yield better-converged results.'
)
if X_tsne is None:
    from sklearn.manifold import TSNE
    from . import _tsne_fix  # fix by D. DeTomaso for sklearn < 0.19

    # unfortunately, sklearn does not allow setting a minimum number
    # of iterations for Barnes-Hut t-SNE
    tsne = TSNE(**params_sklearn)
    logg.info(' using sklearn.manifold.TSNE with a fix by D. DeTomaso')
    X_tsne = tsne.fit_transform(X)
# update AnnData instance
adata.obsm['X_tsne'] = X_tsne # annotate samples with tSNE coordinates
logg.info(
    ' finished',
    time=start,
    deep="added\n 'X_tsne', tSNE coordinates (adata.obsm)",
)
return adata if copy else None
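
# Usage sketch for the function above (hedged: `sc.tl.tsne` and the example
# dataset are assumed here for illustration, they are not in this fragment):
#     import scanpy as sc
#     adata = sc.datasets.pbmc68k_reduced()
#     sc.tl.tsne(adata)
#     adata.obsm['X_tsne'].shape  # -> (n_obs, 2)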
pp.filter_genes(adata, min_counts=1)
# normalize with total UMI count per cell
normalize_total(adata, key_added='n_counts_all')
filter_result = filter_genes_dispersion(
    adata.X, flavor='cell_ranger', n_top_genes=n_top_genes, log=False
)
if plot:  # should not import at the top of the file
    from ..plotting import _preprocessing as ppp
    ppp.filter_genes_dispersion(filter_result, log=True)
# actually filter the genes, the following is the inplace version of
# adata = adata[:, filter_result.gene_subset]
adata._inplace_subset_var(filter_result.gene_subset) # filter genes
normalize_total(adata) # renormalize after filtering
if log:
    pp.log1p(adata)  # log transform: X = log(X + 1)
pp.scale(adata)
logg.info(' finished', time=start)
return adata if copy else None
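
# Usage sketch for the recipe above (hedged: assumes the public name
# `sc.pp.recipe_zheng17` and an AnnData of raw counts; illustrative only):
#     import scanpy as sc
#     sc.pp.recipe_zheng17(adata, n_top_genes=1000)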
def _diffmap(adata, n_comps=15):
    start = logg.info(f'computing Diffusion Maps using n_comps={n_comps}(=n_dcs)')
    dpt = DPT(adata)
    dpt.compute_transitions()
    dpt.compute_eigen(n_comps=n_comps)
    adata.obsm['X_diffmap'] = dpt.eigen_basis
    adata.uns['diffmap_evals'] = dpt.eigen_values
    logg.info(
        ' finished',
        time=start,
        deep=(
            'added\n'
            ' \'X_diffmap\', diffmap coordinates (adata.obsm)\n'
pca_projections, var_r = palantir.utils.run_pca(data_df)
adata.uns['palantir_pca_results'] = dict(
    pca_projections=pca_projections,
    variance_ratio=var_r,
)
logg.info('Diffusion maps in progress ...')
dm_res = adata.uns['palantir_diff_maps'] = \
    palantir.utils.run_diffusion_maps(pca_projections)
ms_data = adata.uns['palantir_ms_data'] = \
    palantir.utils.determine_multiscale_space(dm_res)
logg.info('tSNE in progress ...')
adata.uns['palantir_tsne'] = palantir.utils.run_tsne(ms_data)
logg.info('imputation in progress ...')
adata.uns['palantir_imp_df'] = \
    palantir.utils.run_magic_imputation(data_df, dm_res)
logg.info('End of processing, start plotting.')
return None if inplace else adata
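
# The intermediate results all land in `adata.uns`; a sketch of reading
# them back afterwards (keys taken from the assignments above):
#     pca_res = adata.uns['palantir_pca_results']
#     dm_res = adata.uns['palantir_diff_maps']
#     ms_data = adata.uns['palantir_ms_data']
#     imp_df = adata.uns['palantir_imp_df']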
def _check_datafile_present_and_download(path, backup_url=None):
"""Check whether the file is present, otherwise download.
"""
path = Path(path)
if path.is_file(): return True
if backup_url is None: return False
logg.info(
f'try downloading from url\n{backup_url}\n'
'... this may take a while but only happens once'
)
if not path.parent.is_dir():
logg.info(f'creating directory {path.parent}/ for saving data')
path.parent.mkdir(parents=True)
_download(backup_url, path)
return True
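
# Usage sketch (the path and URL below are hypothetical, for illustration):
#     present = _check_datafile_present_and_download(
#         'data/matrix.mtx', backup_url='https://example.com/matrix.mtx'
#     )
#     # -> True if the file exists locally or was downloaded, else False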
# chunked calculation is not randomized anyway
if svd_solver in {'auto', 'randomized'} and not chunked:
    logg.info(
        'Note that scikit-learn\'s randomized PCA might not be exactly '
        'reproducible across different computational platforms. For exact '
        'reproducibility, choose `svd_solver=\'arpack\'`. This will likely '
        'become the Scanpy default in the future.'
    )
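
# A sketch of the reproducible call the note above recommends (hedged:
# `sc.pp.pca` is scanpy's public wrapper, assumed here for illustration):
#     sc.pp.pca(adata, svd_solver='arpack')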
data_is_AnnData = isinstance(data, AnnData)
if data_is_AnnData:
    adata = data.copy() if copy else data
else:
    adata = AnnData(data)
start = logg.info(f'computing PCA with n_comps = {n_comps}')
if adata.n_vars < n_comps:
    n_comps = adata.n_vars - 1
    logg.debug(
        f'reducing number of computed PCs to {n_comps} '
        f'as dim of data is only {adata.n_vars}'
    )
if use_highly_variable is True and 'highly_variable' not in adata.var.keys():
    raise ValueError(
        'Did not find adata.var[\'highly_variable\']. '
        'Either your data already only consists of highly-variable genes '
        'or consider running `pp.highly_variable_genes` first.'
    )
if use_highly_variable is None:
    use_highly_variable = 'highly_variable' in adata.var.keys()
if use_highly_variable:
    logg.info('computing PCA on highly variable genes')
data[:] = dsets['data']
matrix = csr_matrix(
    (data, dsets['indices'], dsets['indptr']),
    shape=(N, M),
)
# the CSC matrix is automatically the transposed CSR matrix
# as scanpy expects it, so no further transposition is needed
adata = AnnData(
    matrix,
    dict(obs_names=dsets['barcodes'].astype(str)),
    dict(
        var_names=dsets['gene_names'].astype(str),
        gene_ids=dsets['genes'].astype(str),
    ),
)
logg.info('', time=start)
return adata
except KeyError:
    raise Exception('File is missing one or more required datasets.')
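
# Resulting AnnData layout, per the constructor call above (a sketch):
#     adata.obs_names       -> cell barcodes
#     adata.var_names       -> gene names
#     adata.var['gene_ids'] -> gene IDs
#     adata.X               -> sparse CSR count matrix of shape (N, M)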
Returns
-------
asso_names
    List of associated reference names
    (`max_n_names` for each predicted name).
asso_matrix
    Matrix where rows correspond to the predicted labels and columns to the
    reference labels; entries are proportional to the degree of association.
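
Examples
--------
A sketch only; the function name and the label columns below are
assumptions, since the surrounding signature is not shown in this
fragment::

    asso_names, asso_matrix = compute_association_matrix_of_groups(
        adata, prediction='louvain', reference='cell_type',
        normalization='prediction',
    )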
"""
if normalization not in {'prediction', 'reference'}:
    raise ValueError(
        '`normalization` needs to be either "prediction" or "reference".'
    )
sanitize_anndata(adata)
cats = adata.obs[reference].cat.categories
for cat in cats:
    if cat in settings.categories_to_ignore:
        logg.info(
            f'Ignoring category {cat!r} '
            'as it’s in `settings.categories_to_ignore`.'
        )
asso_names = []
asso_matrix = []
for ipred_group, pred_group in enumerate(
        adata.obs[prediction].cat.categories):
    if '?' in pred_group:
        pred_group = str(ipred_group)
    # starting from numpy version 1.13, subtractions of boolean arrays are deprecated
    mask_pred = adata.obs[prediction].values == pred_group
    mask_pred_int = mask_pred.astype(np.int8)
    asso_matrix.append([])
    for ref_group in adata.obs[reference].cat.categories:
        mask_ref = (adata.obs[reference].values == ref_group).astype(np.int8)
        mask_ref_or_pred = mask_ref.copy()
        mask_ref_or_pred[mask_pred] = 1