Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
assert self.fignum > 0
import glob
from bs4 import BeautifulSoup
# parsing files.......
try:
results_path = glob.glob(self.indir+'*/edb/results.edb')[0]
rank_path = glob.glob(self.indir+'*/edb/*.rnk')[0]
gene_set_path = glob.glob(self.indir+'*/edb/gene_sets.gmt')[0]
except IndexError as e:
sys.stderr.write("Could not locate GSEA files in the given directory!")
sys.exit(1)
# extract sample names from .cls file
cls_path = glob.glob(self.indir+'*/edb/*.cls')
if cls_path:
pos, neg, classes = gsea_cls_parser(cls_path[0])
else:
# logic for prerank results
pos, neg = '',''
# start replotting
self.gene_sets=gene_set_path
# obtain gene sets
gene_set_dict = self.parse_gmt(gmt=gene_set_path)
# obtain rank_metrics
rank_metric = self._load_ranking(rank_path)
correl_vector = rank_metric.values
gene_list = rank_metric.index.values
# extract each enrichment term in the results.edb file and plot.
database = BeautifulSoup(open(results_path), features='xml')
length = len(database.findAll('DTG'))
fig_num = self.fignum if self.fignum <= length else length
for idx in range(fig_num):
def run(self):
"""GSEA main procedure"""
assert self.permutation_type in ["phenotype", "gene_set"]
assert self.min_size <= self.max_size
# Start Analysis
self._logger.info("Parsing data files for GSEA.............................")
# phenotype labels parsing
phenoPos, phenoNeg, cls_vector = gsea_cls_parser(self.classes)
# select correct expression genes and values.
dat = self.load_data(cls_vector)
# data frame must have length > 1
assert len(dat) > 1
# ranking metrics calculation.
dat2 = ranking_metric(df=dat, method=self.method, pos=phenoPos, neg=phenoNeg,
classes=cls_vector, ascending=self.ascending)
self.ranking = dat2
# filtering out gene sets and build gene sets dictionary
gmt = self.load_gmt(gene_list=dat2.index.values, gmt=self.gene_sets)
self._logger.info("%04d gene_sets used for further statistical testing....."% len(gmt))
self._logger.info("Start to run GSEA...Might take a while..................")
# cpu numbers
self._set_cores()
# compute ES, NES, pval, FDR, RES
:return: | a dictionary where key is a gene set and values are:
| { es: enrichment score,
| nes: normalized enrichment score,
| p: P-value,
| fdr: FDR,
| size: gene set size,
| matched_size: genes matched to the data,
| genes: gene names from the data set }
"""
assert len(data) > 1
assert permutation_type in ["phenotype", "gene_set"]
data = pd.read_table(data)
classes = gsea_cls_parser(cls)[2]
gmt = gsea_gmt_parser(gene_sets)
gmt.sort()
# compute ES, NES, pval, FDR, RES
if rank_metric is None:
dat = ranking_metric(data,method= method,classes = classes ,ascending=ascending)
results,hit_ind,RES = gsea_compute(data = dat, gene_list = None,rankings = None,
n=permutation_n,gmt = gmt, weighted_score_type=weighted_score_type,
permutation_type=permutation_type)
else:
dat = pd.read_table(rank_metric)
results,hit_ind,RES = gsea_compute(data = None, gene_list = rank_metric['gene_name'],rankings = rank_metric['rank'].values,
n=permutation_n,gmt = gmt, weighted_score_type=weighted_score_type,
permutation_type=permutation_type)
res = {}