# Source code for BRAD.enrichr

"""
Enrichr
-------

This module provides functionality to perform gene enrichment analysis using the `Enrichr <https://maayanlab.cloud/Enrichr/>`_ service. 
It includes functions to query the Enrichr database with a list of genes and retrieve enrichment results that can be displayed, saved, and plotted.

Available Methods
~~~~~~~~~~~~~~~~~

This module has the following methods:

"""
import pandas as pd
from copy import deepcopy
import os
import sys
import time
from importlib import reload
from scipy.stats import zscore
from scipy.stats import entropy
import scipy.io
import scipy
import textwrap
from scipy import sparse
import importlib
from itertools import product
from datetime import datetime
from IPython.display import display # displaying dataframes
import string
import warnings
import re
import matplotlib.pyplot as plt

# Bioinformatics
import gget

from BRAD import utils
from BRAD import log

# gene enrichment
def queryEnrichr(state, gene_list):
    """
    Performs gene enrichment analysis using the Enrichr service and updates the chat status with the results.

    The prompt is scanned (case-insensitively) for the name of a gene-set
    library listed in ``helperData/ggetEnrichrDatabases.tsv``. The first match
    is used as the database; when none is found, the configured default
    database is used instead. The enrichment table returned by
    ``gget.enrichr`` is filtered by the configured maximum p-value, summarized
    to the user as a markdown table, and saved as ``ENRICHR-<database>.csv``.

    :param state: The current status of the chat, including the prompt, configuration, and process details.
        The keys read here are ``state['prompt']``, ``state['config']['max_enrichr_pval']``,
        ``state['config']['default_enrichr_db']`` and ``state['process']['steps']``.
    :type state: dict
    :param gene_list: The genes to query; passed unchanged to ``gget.enrichr``.
    :type gene_list: list

    :raises FileNotFoundError: If the Enrichr databases file is not found next to this module.
    :raises KeyError: If a required entry is missing from ``state``.

    :return: The updated chat status dictionary containing the enrichment results and process details.
    :rtype: dict

    .. note::
        A ``UserWarning`` is emitted when more than one database name appears
        in the prompt (the first one wins) and when no database is named
        (the default is used).
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 6, 2024
    prompt = state['prompt']
    max_p_val = state['config']['max_enrichr_pval']
    default_enrichr_db = state['config']['default_enrichr_db']

    # Plotting and saving are always enabled: both flags start as True, so the
    # original "PLOT" prompt keyword (which set them to True) had no effect.
    db, save, plot, dbfound = None, True, True, False

    # Remove any punctuation except for - and _, which are used in gget database names
    punctuation_to_remove = string.punctuation.replace('-', '').replace('_', '')
    translator = str.maketrans('', '', punctuation_to_remove)
    prompt = prompt.translate(translator)

    # Load the known gene-set library names shipped alongside this module
    current_script_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(current_script_dir, 'helperData', 'ggetEnrichrDatabases.tsv')
    df = pd.read_csv(file_path, delimiter='\t')

    # Map the upper-cased name to the spelling used in the file so that a
    # case-insensitive match in the prompt is still sent to Enrichr with the
    # library's own capitalization (assumes the file holds the canonical names).
    canonical_names = {name.upper(): name for name in df['Gene-set Library']}

    # split() without arguments also separates on newlines/tabs, which the
    # punctuation stripping above leaves in place
    for token in prompt.split():
        canonical = canonical_names.get(token.upper())
        if canonical is None:
            continue
        if db is not None:
            warnings.warn('Two potential databases were provided!')
        else:
            db = canonical
            dbfound = True

    if db is None:
        warnings.warn('warning: setting db to default')
        db = default_enrichr_db

    # query GO
    ax = None
    if plot:
        fig, ax = plt.subplots()

    start_time = time.time()
    edf = gget.enrichr(gene_list, database=db, plot=plot, ax=ax)
    state['process']['steps'].append(
        {
            'genes': str(gene_list),
            'plot': plot,
            'save': save,
            'database': db,
            # NOTE(review): this records whether a database was found in the
            # prompt, which reads as the opposite of the key name - confirm
            # with log consumers before changing.
            'default db': dbfound,
            'time': time.time() - start_time,
        }
    )

    # Keep only the terms at or below the configured significance threshold
    edf = edf[edf['p_val'] <= max_p_val]

    output = 'The following table was generated by quering the gene list against ' + db + ':'
    output += "\n\n---\n\n"
    displayDf = edf[['path_name', 'p_val', 'overlapping_genes']]
    filtered_df = displayDf[displayDf["p_val"] < 1e-5]  # Filter rows where p_val < 10^-5
    if len(filtered_df) < 10:
        # Too few highly significant terms: show the first 10 rows instead
        filtered_df = displayDf.head(10)
    output += filtered_df.to_markdown()
    output += "\n\n---\n\n"

    state = log.userOutput(output, state=state)
    state = utils.save(state, edf, 'ENRICHR-' + db + '.csv')
    plt.show()
    return state