# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# ---- CLI arguments ------------------------------------------------------
# NOTE(review): `arguments_group`, `parser`, `DATASETS`, `sc`, and `sparse`
# are defined earlier in the file, outside this chunk.
# Fix: the help text for --do_train previously read 'Batch Size' (copy-paste
# error from another argument); it now describes the flag it documents.
arguments_group.add_argument('-t', '--do_train', type=int, default=1, required=False,
                             help='Whether to train the model (1) or skip training (0)')
arguments_group.add_argument('-r', '--dropout_rate', type=float, default=0.5, required=False,
                             help='Dropout ratio')
args = vars(parser.parse_args())

# Resolve the dataset-specific configuration selected on the command line.
data_dict = DATASETS[args['data']]
data_name = data_dict.get('name', None)
cell_type_key = data_dict.get("cell_type", None)
source_key = data_dict.get('source_key')
target_key = data_dict.get('target_key')

# Load pre-split train/validation AnnData files for the chosen dataset.
train_path = f"../data/{data_name}/train_{data_name}.h5ad"
valid_path = f"../data/{data_name}/valid_{data_name}.h5ad"
data = sc.read(train_path)
validation = sc.read(valid_path)

# Densify sparse expression matrices (`.A` is scipy's toarray() shorthand).
if sparse.issparse(data.X):
    data.X = data.X.A
if sparse.issparse(validation.X):
    validation.X = validation.X.A

# =============================== data gathering ====================================
spec_cell_types = data_dict.get('spec_cell_types', None)
cell_types = data.obs[cell_type_key].unique().tolist()
for spec_cell_type in spec_cell_types:
    # Hold out cells that are BOTH in the target condition and of the
    # currently held-out cell type; everything else is training data.
    train_real = data.copy()[~((data.obs['condition'] == target_key) & (data.obs[cell_type_key] == spec_cell_type))]
    train_real_stim = train_real[train_real.obs["condition"] == target_key]
    train_real_ctrl = train_real[train_real.obs["condition"] == source_key]
    train_real_stim = train_real_stim.X
# ---- QC fragment: mitochondrial content + per-cell gene-count filtering.
# NOTE(review): `adata`, `gencodeMitos`, `pipeLog`, `np`, `sc`, `thrsh_mito`
# and `conf` are defined outside this chunk; presumably `gencodeMitos` is a
# set of Gencode mitochondrial gene ids — TODO confirm.
# Gencode ids carry a ".version" suffix; strip it before the membership test.
mito_genes = [name for name in adata.var_names if name.split('.')[0] in gencodeMitos]
if(len(mito_genes)==0): # no single mitochondrial gene in the expression matrix ?
pipeLog("WARNING - No single mitochondrial gene was found in the expression matrix.")
pipeLog("Dying cells cannot be removed - please check your expression matrix")
doMito = False
else:
doMito = True
# Fraction of each cell's counts that come from mitochondrial genes.
adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X, axis=1) / np.sum(adata.X, axis=1)
# Total counts per cell.
adata.obs['UMI_Count'] = np.sum(adata.X, axis=1)
# QC plots: distributions and pairwise scatters (saved via the save= suffix).
sc.pl.violin(adata, ['n_genes', 'UMI_Count', 'percent_mito'], jitter=0.4, multi_panel=True)
fig1=sc.pl.scatter(adata, x='UMI_Count', y='percent_mito', save="_percent_mito")
fig2=sc.pl.scatter(adata, x='UMI_Count', y='n_genes', save="_gene_count")
# Drop cells with high mitochondrial fraction (likely dying cells).
adata = adata[adata.obs['percent_mito'] < thrsh_mito, :]
if conf.get("doFilterGenes", True):
up_thrsh_genes=conf.get("filterMaxGenes", 15000)
low_thrsh_genes=conf.get("filterMinGenes", 10)
pipeLog("Remove cells with less than %d and more than %d genes" % (low_thrsh_genes, up_thrsh_genes))
#Filtering out cells according to filter parameters
pipeLog('Filtering cells')
# Keep cells whose detected-gene count is strictly inside (low, up).
adata = adata[adata.obs['n_genes'] < up_thrsh_genes, :]
adata = adata[adata.obs['n_genes'] > low_thrsh_genes, :]
pipeLog("After filtering: Data has %d samples/observations and %d genes/variables" % (len(adata.obs), len(adata.var)))
# NOTE(review): the body of this `if` is not visible in this chunk — the
# normalization step appears to have been truncated in the paste.
if conf.get("doNormalize", True):
# =============================== downloading training and validation files ====================================
# we do not use the validation data to apply vector arithmetic in gene expression space
# Download the Kang et al. PBMC dataset splits if they are not cached locally.
train_path = "../data/train_kang.h5ad"
valid_path = "../data/valid_kang.h5ad"
if os.path.isfile(train_path):
data = sc.read(train_path)
else:
train_url = "https://drive.google.com/uc?export=download&id=1-RpxbXwXEJLYZDFSHnWYenojZ8TxRZsP"
t_dl = wget.download(train_url, train_path)
data = sc.read(train_path)
if os.path.isfile(valid_path):
validation = sc.read(valid_path)
else:
# NOTE(review): this reuses the SAME Google Drive file id as the training
# set, so the "validation" file would be a copy of the training data —
# verify the correct id for valid_kang.h5ad.
train_url = "https://drive.google.com/uc?export=download&id=1-RpxbXwXEJLYZDFSHnWYenojZ8TxRZsP"
t_dl = wget.download(train_url, valid_path)
validation = sc.read(valid_path)
# =============================== data gathering ====================================
#training cells
t_in = ['CD8T','NK','B','Dendritic', 'FCGR3A+Mono','CD14+Mono']
#heldout cells (excluded from training, used for out-of-distribution evaluation)
t_out = [ 'CD4T']
# data_reader splits the data by condition/cell type; "ctrl"/"stim" map to
# the condition labels used in the AnnData obs column.
dr = data_reader(data, validation,{"ctrl":"control", "stim":"stimulated"}, t_in, t_out)
train_real_cd = dr.train_real_adata[dr.train_real_adata.obs["condition"] == "control",:]
# Balance cell-type proportions in the control training set.
train_real_cd = dr.balancer(train_real_cd)
train_real_stimulated = dr.train_real_adata[dr.train_real_adata.obs["condition"] == "stimulated",:]
# NOTE(review): wildcard import — pulls all public names from the local
# `hf` helper module into this namespace; consider explicit imports.
from hf import *
import numpy as np
import scanpy.api as sc
import os
from data_reader import data_reader
import wget
# =============================== downloading training and validation files ====================================
# we do not use the validation data to apply vector arithmetic in gene expression space
# Download the Kang et al. PBMC dataset splits if they are not cached locally.
train_path = "../data/train_kang.h5ad"
valid_path = "../data/valid_kang.h5ad"
if os.path.isfile(train_path):
data = sc.read(train_path)
else:
train_url = "https://drive.google.com/uc?export=download&id=1-RpxbXwXEJLYZDFSHnWYenojZ8TxRZsP"
t_dl = wget.download(train_url, train_path)
data = sc.read(train_path)
if os.path.isfile(valid_path):
validation = sc.read(valid_path)
else:
# NOTE(review): same Drive file id as the training set — the downloaded
# "validation" file would duplicate the training data; verify the id.
train_url = "https://drive.google.com/uc?export=download&id=1-RpxbXwXEJLYZDFSHnWYenojZ8TxRZsP"
t_dl = wget.download(train_url, valid_path)
validation = sc.read(valid_path)
# =============================== data gathering ====================================
#training cells
t_in = ['CD8T','NK','B','Dendritic', 'FCGR3A+Mono','CD14+Mono']
#heldout cells
t_out = [ 'CD4T']
import numpy as np
import scanpy.api as sc
import os
from data_reader import data_reader
import wget
# NOTE(review): PCA is imported but not used anywhere in this visible chunk.
from sklearn.decomposition import PCA
# =============================== downloading training and validation files ====================================
# we do not use the validation data to apply vector arithmetic in gene expression space
# Download the Kang et al. PBMC dataset splits if they are not cached locally.
train_path = "../data/train_kang.h5ad"
valid_path = "../data/valid_kang.h5ad"
if os.path.isfile(train_path):
data = sc.read(train_path)
else:
train_url = "https://drive.google.com/uc?export=download&id=1-RpxbXwXEJLYZDFSHnWYenojZ8TxRZsP"
t_dl = wget.download(train_url, train_path)
data = sc.read(train_path)
if os.path.isfile(valid_path):
validation = sc.read(valid_path)
else:
# NOTE(review): same Drive file id as the training set — the downloaded
# "validation" file would duplicate the training data; verify the id.
train_url = "https://drive.google.com/uc?export=download&id=1-RpxbXwXEJLYZDFSHnWYenojZ8TxRZsP"
t_dl = wget.download(train_url, valid_path)
validation = sc.read(valid_path)
# =============================== data gathering ====================================
#training cells
t_in = ['CD8T','NK','B','Dendritic', 'FCGR3A+Mono','CD14+Mono']
#heldout cells
t_out = [ 'CD4T']
def louvain(X, N, resolution=1, seed=None, replace=False):
# Cluster X with Louvain community detection, then sample N cells by
# repeatedly picking a random cluster and a random member of it.
# NOTE(review): the `seed` parameter is accepted but never used below —
# sampling goes through np.random.choice with global RNG state; confirm
# whether seeding was intended.
from anndata import AnnData
import scanpy.api as sc
adata = AnnData(X=X)
# Build the kNN graph directly on X (no PCA/dimensionality reduction).
sc.pp.neighbors(adata, use_rep='X')
sc.tl.louvain(adata, resolution=resolution, key_added='louvain')
cluster_labels_full = adata.obs['louvain'].tolist()
# Map cluster label -> list of row indices belonging to that cluster.
louv = {}
for i, cluster in enumerate(cluster_labels_full):
if cluster not in louv:
louv[cluster] = []
louv[cluster].append(i)
lv_idx = []
for n in range(N):
# Pick a cluster uniformly at random, then a cell within it uniformly.
louv_cells = list(louv.keys())
louv_cell = louv_cells[np.random.choice(len(louv_cells))]
samples = list(louv[louv_cell])
sample = samples[np.random.choice(len(samples))]
if not replace:
# NOTE(review): from here on the content appears to belong to a DIFFERENT
# function (a clustering-benchmark routine): `X_dimred`, `samp_idx`,
# `spect`, `cell_labels`, `stats`, `kwargs`, `label_approx`,
# `normalized_mutual_info_score` and `adjusted_mutual_info_score` are not
# defined in this chunk. The original `louvain` body (removal of the
# sampled cell, appending to lv_idx, the return) seems to have been cut.
full_labels = label_approx(X_dimred, X_dimred[samp_idx, :],
spect.labels_, k=5)
bnmi = normalized_mutual_info_score(
cell_labels, full_labels, dist='balanced'
)
nmi = normalized_mutual_info_score(cell_labels, full_labels)
stats.append(nmi)
stats.append(bnmi)
if 'louvain_ami' in kwargs and kwargs['louvain_ami']:
cell_labels = kwargs['cell_labels']
adata = AnnData(X=X_dimred[samp_idx, :])
sc.pp.neighbors(adata, use_rep='X')
amis = []
bamis = []
# Evaluate Louvain agreement with ground-truth labels at three resolutions.
for r in [ 0.5, 1., 2. ]:
sc.tl.louvain(adata, resolution=r, key_added='louvain')
louv_labels = np.array(adata.obs['louvain'].tolist())
full_labels = label_approx(X_dimred, X_dimred[samp_idx, :],
louv_labels, k=5)
ami = adjusted_mutual_info_score(cell_labels, full_labels)
bami = adjusted_mutual_info_score(
cell_labels, full_labels, dist='balanced'
)
amis.append(ami)
from data_reader import data_reader
# =============================== downloading training and validation files ====================================
# Download the Kang et al. PBMC dataset splits if they are not cached locally.
train_path = "../data/train_kang.h5ad"
valid_path = "../data/valid_kang.h5ad"
if os.path.isfile(train_path):
data = sc.read(train_path)
else:
train_url = "https://drive.google.com/uc?export=download&id=1-RpxbXwXEJLYZDFSHnWYenojZ8TxRZsP"
t_dl = wget.download(train_url, train_path)
data = sc.read(train_path)
if os.path.isfile(valid_path):
validation = sc.read(valid_path)
else:
# NOTE(review): same Drive file id as the training set — the downloaded
# "validation" file would duplicate the training data; verify the id.
train_url = "https://drive.google.com/uc?export=download&id=1-RpxbXwXEJLYZDFSHnWYenojZ8TxRZsP"
t_dl = wget.download(train_url,valid_path)
validation = sc.read(valid_path)
# =============================== data gathering ====================================
#training cells
t_in = ['CD8T','NK','B','Dendritic', 'FCGR3A+Mono','CD14+Mono']
#heldout cells
t_out = [ 'CD4T']
# Split into train/validation AnnData objects keyed by condition labels.
dr = data_reader(data, validation,{"ctrl":"control", "stim":"stimulated"}, t_in, t_out)
train_real = dr.train_real_adata
valid_real = dr.valid_real_adata
import tensorflow as tf
import numpy as np
import os
import scanpy.api as sc
import wget
from data_reader import data_reader
from random import shuffle
# =============================== downloading training and validation files ====================================
# Download the Kang et al. PBMC dataset splits if they are not cached locally.
train_path = "../data/train_kang.h5ad"
valid_path = "../data/valid_kang.h5ad"
if os.path.isfile(train_path):
data = sc.read(train_path)
else:
train_url = "https://drive.google.com/uc?export=download&id=1-RpxbXwXEJLYZDFSHnWYenojZ8TxRZsP"
t_dl = wget.download(train_url, train_path)
data = sc.read(train_path)
if os.path.isfile(valid_path):
validation = sc.read(valid_path)
else:
# NOTE(review): same Drive file id as the training set — the downloaded
# "validation" file would duplicate the training data; verify the id.
train_url = "https://drive.google.com/uc?export=download&id=1-RpxbXwXEJLYZDFSHnWYenojZ8TxRZsP"
t_dl = wget.download(train_url, valid_path)
validation = sc.read(valid_path)
# =============================== data gathering ====================================
#training cells
t_in = ['CD8T','NK','B','Dendritic', 'FCGR3A+Mono','CD14+Mono']
#heldout cells
t_out = [ 'CD4T']
def readMatrixAnndata(matrixFname, samplesOnRows=False):
    """Read an expression matrix and return an adata object.

    Supports .mtx, .h5 and .tsv (not .tsv.gz). For .mtx input, the
    companion genes.tsv / barcodes.tsv files are expected in the same
    directory as the matrix file.
    """
    import scanpy.api as sc

    is_mtx = matrixFname.endswith(".mtx")
    if is_mtx:
        import pandas as pd
        logging.info("Loading expression matrix: mtx format")
        # The .mtx file stores genes x cells; transpose to cells x genes.
        adata = sc.read(matrixFname, cache=False).T
        mtxDir = dirname(matrixFname)
        gene_table = pd.read_csv(join(mtxDir, 'genes.tsv'), header=None, sep='\t')
        barcode_table = pd.read_csv(join(mtxDir, 'barcodes.tsv'), header=None)
        # Column 1 of genes.tsv holds gene symbols; barcodes.tsv column 0
        # holds the cell barcodes.
        adata.var_names = gene_table[1]
        adata.obs_names = barcode_table[0]
    else:
        logging.info("Loading expression matrix: tab-sep format")
        adata = sc.read(matrixFname, cache=False, first_column_names=True)
        if not samplesOnRows:
            logging.info("Transposing the expression matrix")
            adata = adata.T
    return adata