Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
)
# do the actual reading
if ext == 'xlsx' or ext == 'xls':
if sheet is None:
raise ValueError(
"Provide `sheet` parameter when reading '.xlsx' files."
)
else:
adata = read_excel(filename, sheet)
elif ext in {'mtx', 'mtx.gz'}:
adata = read_mtx(filename)
elif ext == 'csv':
adata = read_csv(filename, first_column_names=first_column_names)
elif ext in {'txt', 'tab', 'data', 'tsv'}:
if ext == 'data':
logg.hint(
"... assuming '.data' means tab or white-space "
'separated text file',
)
logg.hint('change this by passing `ext` to sc.read')
adata = read_text(filename, delimiter, first_column_names)
elif ext == 'soft.gz':
adata = _read_softgz(filename)
elif ext == 'loom':
adata = read_loom(filename=filename, **kwargs)
else:
raise ValueError(f'Unknown extension {ext}.')
if cache:
logg.info(
f'... writing an {settings.file_format_data} '
'cache file to speedup reading next time'
)
"""
if n_pcs == 0:
logg.info(' using data matrix X directly (no PCA)')
return adata.X
elif n_pcs is None and 'X_pca' in adata.obsm_keys():
logg.info(f' using \'X_pca\' with n_pcs = {adata.obsm["X_pca"].shape[1]}')
return adata.obsm['X_pca']
elif ('X_pca' in adata.obsm_keys()
and adata.obsm['X_pca'].shape[1] >= n_pcs):
logg.info(f' using \'X_pca\' with n_pcs = {n_pcs}')
return adata.obsm['X_pca'][:, :n_pcs]
else:
n_pcs = N_PCS if n_pcs is None else n_pcs
if adata.X.shape[1] > n_pcs:
logg.info(f' computing \'X_pca\' with n_pcs = {n_pcs}')
logg.hint('avoid this by setting n_pcs = 0')
X = pca(adata.X, n_comps=n_pcs, random_state=random_state)
adata.obsm['X_pca'] = X
return X
else:
logg.info(' using data matrix X directly (no PCA)')
return adata.X
# uses full r_genes if ctrl_size > len(r_genes)
control_genes.update(set(r_genes[:ctrl_size]))
# To index, we need a list – indexing implies an order.
control_genes = list(control_genes - gene_list)
gene_list = list(gene_list)
X_list = _adata[:, gene_list].X
if issparse(X_list): X_list = X_list.toarray()
X_control = _adata[:, control_genes].X
if issparse(X_control): X_control = X_control.toarray()
X_control = np.nanmean(X_control, axis=1)
if len(gene_list) == 0:
# We shouldn't even get here, but just in case
logg.hint(
f'could not add \n'
f' {score_name!r}, score of gene set (adata.obs)'
)
return adata if copy else None
elif len(gene_list) == 1:
if _adata[:, gene_list].X.ndim == 2:
vector = _adata[:, gene_list].X.toarray()[:, 0] # new anndata
else:
vector = _adata[:, gene_list].X # old anndata
score = vector - X_control
else:
score = np.nanmean(X_list, axis=1) - X_control
adata.obs[score_name] = pd.Series(np.array(score).ravel(), index=adata.obs_names)
logg.info(
else:
logg.debug(f'reading sheet {sheet} from file {filename}')
return read_hdf(filename, sheet)
# read other file types
path_cache = settings.cachedir / _slugify(filename).replace('.' + ext, '.h5ad') # type: Path
if path_cache.suffix in {'.gz', '.bz2'}:
path_cache = path_cache.with_suffix('')
if cache and path_cache.is_file():
logg.info(f'... reading from cache file {path_cache}')
return read_h5ad(path_cache)
if not is_present:
raise FileNotFoundError(f'Did not find file {filename}.')
logg.debug(f'reading {filename}')
if not cache and not suppress_cache_warning:
logg.hint(
'This might be very slow. Consider passing `cache=True`, '
'which enables much faster reading from a cache file.'
)
# do the actual reading
if ext == 'xlsx' or ext == 'xls':
if sheet is None:
raise ValueError(
"Provide `sheet` parameter when reading '.xlsx' files."
)
else:
adata = read_excel(filename, sheet)
elif ext in {'mtx', 'mtx.gz'}:
adata = read_mtx(filename)
elif ext == 'csv':
adata = read_csv(filename, first_column_names=first_column_names)
elif ext in {'txt', 'tab', 'data', 'tsv'}:
df = df.loc[adata.var_names]
else:
df = df.loc[adata.var_names]
dispersion_norm = df.dispersions_norm.values
dispersion_norm[np.isnan(dispersion_norm)] = 0 # similar to Seurat
gene_subset = np.logical_and.reduce((
df.means > min_mean, df.means < max_mean,
df.dispersions_norm > min_disp,
df.dispersions_norm < max_disp,
))
df['highly_variable'] = gene_subset
logg.info(' finished', time=start)
if inplace or subset:
logg.hint(
'added\n'
' \'highly_variable\', boolean vector (adata.var)\n'
' \'means\', float vector (adata.var)\n'
' \'dispersions\', float vector (adata.var)\n'
' \'dispersions_norm\', float vector (adata.var)'
)
adata.var['highly_variable'] = df['highly_variable'].values
adata.var['means'] = df['means'].values
adata.var['dispersions'] = df['dispersions'].values
adata.var['dispersions_norm'] = df['dispersions_norm'].values.astype('float32', copy=False)
if batch_key is not None:
adata.var['highly_variable_nbatches'] = df['highly_variable_nbatches'].values
adata.var['highly_variable_intersection'] = df['highly_variable_intersection'].values
if subset:
adata._inplace_subset_var(df['highly_variable'].values)
else:
raise ValueError(
"Provide `sheet` parameter when reading '.xlsx' files."
)
else:
adata = read_excel(filename, sheet)
elif ext in {'mtx', 'mtx.gz'}:
adata = read_mtx(filename)
elif ext == 'csv':
adata = read_csv(filename, first_column_names=first_column_names)
elif ext in {'txt', 'tab', 'data', 'tsv'}:
if ext == 'data':
logg.hint(
"... assuming '.data' means tab or white-space "
'separated text file',
)
logg.hint('change this by passing `ext` to sc.read')
adata = read_text(filename, delimiter, first_column_names)
elif ext == 'soft.gz':
adata = _read_softgz(filename)
elif ext == 'loom':
adata = read_loom(filename=filename, **kwargs)
else:
raise ValueError(f'Unknown extension {ext}.')
if cache:
logg.info(
f'... writing an {settings.file_format_data} '
'cache file to speedup reading next time'
)
if cache_compression is _empty:
cache_compression = settings.cache_compression
if not path_cache.parent.is_dir():
path_cache.parent.mkdir(parents=True)
embed_x = adata.obsm[f'X_{basis}'][:, components[0]]
embed_y = adata.obsm[f'X_{basis}'][:, components[1]]
adata.obs[density_covariate] = _calc_density(embed_x, embed_y)
# Reduce diffmap components for labeling
# Note: plot_scatter takes care of correcting diffmap components
# for plotting automatically
if basis != 'diffmap':
components += 1
adata.uns[f'{density_covariate}_params'] = dict(
covariate=groupby, components=components.tolist()
)
logg.hint(
f"added\n"
f" '{density_covariate}', densities (adata.obs)\n"