Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Check size of df
assert previous_df_shape[0] == enrichment_score.shape[0] + len(drop_indices), 'Problem removing random rows'
assert previous_df_shape[1] == enrichment_score.shape[1], 'Columns should have not changed after removing rows'
# Transpose dataFrame to arrange columns as pathways and rows as samples
enrichment_score_df = enrichment_score.transpose()
# Set column index to the first row in the dataframe
enrichment_score_df.columns = enrichment_score_df.iloc[0]
# Remove the first row because it is already set as column index
enrichment_score_df = enrichment_score_df.drop("Term|NES")
# Get class labels
_, _, class_vector = gseapy.parser.gsea_cls_parser(classes_file)
class_labels = []
for label in class_vector:
if label == 'Normal':
class_labels.append(0)
elif label == 'Tumor':
class_labels.append(1)
# Get list of pathways as features
feature_cols = list(enrichment_score_df.columns.values)
# split dataset into features and target variable (i.e., normal vs tumor sample labels)
pathways = enrichment_score_df[feature_cols] # Features
pathways.reset_index(drop=True, inplace=True)
def parse_class_vector(path):
    """Parse a GSEA class file via ``gp.parser.gsea_cls_parser``.

    Thin wrapper: ``path`` is handed to the parser unchanged and the parser's
    result is passed back to the caller untouched.

    :param path: location of the class file to parse
    :return: whatever ``gp.parser.gsea_cls_parser`` returns for ``path``
    """
    parsed_classes = gp.parser.gsea_cls_parser(path)
    return parsed_classes
def filter_gene_exp_data(expression_data: pd.DataFrame, gmt_file: str):
"""Filter gene expression data file to include only gene names which are found in the gene set files.
:param expression_data: gene expression values for samples
:param gmt_file: .gmt file containing gene sets
:return: Filtered gene expression data with genes with no correspondences in gene sets removed
:rtype: pandas.core.frame.DataFrame
"""
filtered_expression_data = expression_data.copy()
# Gene universe from gene set
gene_sets = gseapy.parser.gsea_gmt_parser(gmt_file, max_size=40000)
# All the genes in gene set files
gene_universe = set(itt.chain(*gene_sets.values()))
genes_to_remove = [
gene
for gene in filtered_expression_data.index.values
if gene not in gene_universe
]
# Genes to be removed because they are not present in the gene sets
counter = len(genes_to_remove)
logger.info(f'Expression data has {len(filtered_expression_data.index.values)}')
logger.info(f'Gene universe has {len(gene_universe)}')
logger.info(f'{counter} were removed in expression data')
logger.info(