# Source code for BRAD.enrichr

"""
Enrichr
-------

This module provides functionality to perform gene enrichment analysis using the `Enrichr <https://maayanlab.cloud/Enrichr/>`_ service. 
It includes functions to query the Enrichr database with a list of genes and retrieve enrichment results that can be displayed, saved, and plotted.

Available Methods
~~~~~~~~~~~~~~~~~

This module has the following methods:

"""
import pandas as pd
from copy import deepcopy
import os
import sys
import time
from importlib import reload
from scipy.stats import zscore
from scipy.stats import entropy
import scipy.io
import scipy
import textwrap
from scipy import sparse
import importlib
from itertools import product
from datetime import datetime
from IPython.display import display # displaying dataframes
import string
import warnings
import re
import matplotlib.pyplot as plt

# Bioinformatics
import gget

from BRAD import utils
from BRAD import log

# gene enrichment

# [docs]
def queryEnrichr(state, gene_list):
    """
    Performs gene enrichment analysis using the Enrichr service and updates the chat status with the results.

    The Enrichr database is the first whitespace-separated word of ``state['prompt']``
    (after punctuation other than ``-`` and ``_`` is stripped) that matches, case-insensitively,
    an entry of the ``Gene-set Library`` column in ``helperData/ggetEnrichrDatabases.tsv``.
    The library's own spelling from that file is what gets sent to ``gget.enrichr``. When no
    word matches, ``state['config']['default_enrichr_db']`` is used instead.

    Rows whose ``p_val`` exceeds ``state['config']['max_enrichr_pval']`` are dropped. A markdown
    table of ``path_name``, ``p_val`` and ``overlapping_genes`` is sent through ``log.userOutput``
    and the filtered table is written with ``utils.save`` as ``ENRICHR-<db>.csv``.

    :param state: The current status of the chat, including the prompt, configuration, and process details.
    :type state: dict
    :param gene_list: Genes to test for enrichment; forwarded unchanged to ``gget.enrichr``.
    :type gene_list: list

    :raises FileNotFoundError: If the Enrichr databases file is not found.
    :raises KeyError: If ``state`` lacks ``prompt``, ``process['steps']`` or one of the config keys read here.

    A ``warnings.warn`` is issued (nothing is raised) when more than one database name appears in
    the prompt, and when no database is named and the default is used.

    :return: The updated chat status dictionary containing the enrichment results and process details.
    :rtype: dict
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 6, 2024
    prompt             = state['prompt']
    max_p_val          = state['config']['max_enrichr_pval']
    default_enrichr_db = state['config']['default_enrichr_db']

    # Plotting and saving are always on: no prompt keyword can switch either of them off.
    save, plot = True, True

    # Remove any punctuation except for - and _, which are used in gget database names
    punctuation_to_remove = string.punctuation.replace('-', '').replace('_', '')
    prompt = prompt.translate(str.maketrans('', '', punctuation_to_remove))

    # Map upper-cased library names to the spelling used in the database file so that a
    # case-insensitive match in the prompt still yields the exact name Enrichr expects.
    current_script_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(current_script_dir, 'helperData', 'ggetEnrichrDatabases.tsv')
    df = pd.read_csv(file_path, delimiter='\t')
    canonical_db = {}
    for name in df['Gene-set Library'].values:
        canonical_db.setdefault(name.upper(), name)

    # The first database named in the prompt wins; later ones only trigger a warning.
    # split() with no argument also separates on newlines/tabs and drops empty tokens.
    db = None
    for word in prompt.split():
        match = canonical_db.get(word.upper())
        if match is None:
            continue
        if db is not None:
            warnings.warn('Two potential databases were provided!')
        else:
            db = match
    dbfound = db is not None
    if db is None:
        warnings.warn('warning: setting db to default')
        db = default_enrichr_db

    # query GO
    fig, ax = plt.subplots() if plot else (None, None)
    try:
        start_time = time.time()
        edf = gget.enrichr(gene_list, database=db, plot=plot, ax=ax)
        state['process']['steps'].append(
            {
                'genes'      : str(gene_list),
                'plot'       : plot,
                'save'       : save,
                'database'   : db,
                # NOTE(review): this is True when the database came from the prompt, i.e. the
                # opposite of what the key name suggests. Kept as-is for log compatibility.
                'default db' : dbfound,
                'time' : time.time() - start_time
            }
        )

        # NOTE(review): gget.enrichr appears to log an error and return None when Enrichr has
        # no results; report that instead of failing on the subscript below. Confirm against gget.
        if edf is None:
            return log.userOutput('No Enrichr results were returned for ' + db + '.', state=state)

        edf = edf[edf['p_val'] <= max_p_val]

        displayDf = edf[['path_name', 'p_val', 'overlapping_genes']]
        filtered_df = displayDf[displayDf["p_val"] < 1e-5]  # Keep only strongly significant rows
        if len(filtered_df) < 10:
            # Too few strongly significant rows: show the first 10 rows of the table instead
            filtered_df = displayDf.head(10)

        output = 'The following table was generated by quering the gene list against ' + db + ':'
        output += "\n\n---\n\n"
        output += filtered_df.to_markdown()
        output += "\n\n---\n\n"
        state = log.userOutput(output, state=state)
        state = utils.save(state, edf, 'ENRICHR-' + db + '.csv')

        plt.show()
        return state
    finally:
        # Release the figure so repeated queries do not accumulate open matplotlib figures.
        if fig is not None:
            plt.close(fig)