Source code for BRAD.utils

"""
This module provides a set of utility functions designed to streamline common tasks related to file management, 
data handling, directory operations, and more across the core and tool modules.

Scope
=====

The goal of this module is to offer reusable, general-purpose utilities that simplify routine tasks that interface the LLM with other aspects of the
code. These tasks include saving and loading files, ensuring directories exist, generating standardized file paths,
and more. Each function is designed to abstract repetitive operations and enhance code clarity, maintainability,
and reliability. The functions in this module can be imported as needed when building different aspects of the BRAD framework.




Available Methods
=================


This module contains the following methods:

"""

import re
import os
import time
import numpy as np
import pandas as pd
import subprocess
import difflib
import matplotlib.pyplot as plt
import shutil
from urllib.parse import urlparse

from langchain import PromptTemplate, LLMChain
from langchain_community.callbacks import get_openai_callback

from BRAD import log
from BRAD.promptTemplates import fileChooserTemplate, fieldChooserTemplate


[docs]
def save(state, data, name):
    """
    Save data to the output directory, with an optional stage number prefix.

    The file is written to ``state['output-directory']``. When ``state['queue']``
    is non-empty (i.e. the call is part of a pipeline), the file name is prefixed
    with ``S<stage>-`` where ``<stage>`` is ``state['queue pointer'] + 1``.

    Args:
        state (dict): The current chat status. Reads the keys 'queue',
                           'queue pointer' (only when the queue is non-empty),
                           'output-directory' and 'process'.
        data (pd.DataFrame or str): The data to be saved. A DataFrame is written as CSV
                                    (without the index); a string is written verbatim
                                    when `name` ends with '.tex'.
        name (str): The name of the output file.

    Returns:
        dict: The updated `state` dictionary with a record of the saved file
        appended to ``state['process']['steps']``.

    Raises:
        ValueError: If `data` is neither a DataFrame nor a string destined for a .tex file.

    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 19, 2024

    # If this is part of a pipeline, then add the stage number to the printed output
    if len(state['queue']) != 0:
        stageNum = state['queue pointer'] + 1
        name = 'S' + str(stageNum) + '-' + name
    output_path = os.path.join(state['output-directory'], name)

    if isinstance(data, pd.DataFrame):
        data.to_csv(output_path, index=False)
    elif isinstance(data, str) and output_path.endswith('.tex'):
        # The type is checked before opening so that invalid input never
        # creates or truncates the target file.
        with open(output_path, 'w') as file:
            file.write(data)
    else:
        raise ValueError("Unsupported data type or file extension. Use a DataFrame for CSV or a string for .tex files.")

    log.debugLog('The information has been saved to: ' + output_path, state=state)
    state['process']['steps'].append(
        {
            'func'     : 'utils.save',
            'new file' : output_path
        }
    )
    return state



[docs]
def savefig(state, ax, name):
    """
    Save the current matplotlib figure into the image sub-directory of the output directory.

    The target path is ``<output-directory>/<image-path-extension>/<name>``, where the
    image sub-directory comes from ``state['config']['image-path-extension']``. When
    ``state['queue']`` is non-empty, `name` is prefixed with ``S<stage>-`` using
    ``state['queue pointer'] + 1`` as the stage number.

    Args:
        state (dict): The current chat status, including the queued pipeline stages,
                           the output directory and the configuration.
        ax (matplotlib.axes.Axes): Accepted for interface compatibility. It is not
                           referenced by this function; the figure written is whatever
                           ``plt.savefig`` considers the current figure.
        name (str): The name of the output file.

    Returns:
        dict: The updated `state` dictionary with a record of the saved file
        appended to ``state['process']['steps']``.

    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 19, 2024
    log.debugLog("SAVEFIG", state=state)

    # Inside a pipeline the file name carries the stage number
    in_pipeline = len(state['queue']) != 0
    if in_pipeline:
        stage_prefix = 'S' + str(state['queue pointer'] + 1) + '-'
        name = stage_prefix + name

    image_dir = os.path.join(state['output-directory'], state['config']['image-path-extension'])
    output_path = os.path.join(image_dir, name)

    # The image directory may not exist yet
    ensure_directory_exists(output_path, state)
    plt.savefig(output_path)
    log.debugLog('The image was saved to: ' + output_path, state=state)

    step_record = {
        'func'     : 'utils.savefig',
        'new file' : output_path
    }
    state['process']['steps'].append(step_record)
    return state



[docs]
def ensure_directory_exists(file_path, state):
    """
    Ensure that the directory for a given file path exists, creating it if necessary.

    The directory component of `file_path` is created (including intermediate
    directories) when it does not exist. A debug message reports whether the
    directory was created or already existed. A `file_path` without a directory
    component (a bare file name) needs no directory and is left alone.

    Args:
        file_path (str): The full file path for which the directory needs to be checked/created.
        state (dict): The current chat status; only passed through to the debug logger.

    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 23, 2024
    directory_path = os.path.dirname(file_path)
    if not directory_path:
        # os.makedirs('') raises FileNotFoundError; a bare file name lives in
        # the current working directory, so there is nothing to create.
        log.debugLog(f"No directory component in '{file_path}'; nothing to create.", state=state)
        return
    if os.path.exists(directory_path):
        log.debugLog(f"Directory '{directory_path}' already exists.", state=state)
        return
    # exist_ok=True tolerates another process creating the directory between
    # the existence check above and this call.
    os.makedirs(directory_path, exist_ok=True)
    log.debugLog(f"Directory '{directory_path}' created.", state=state)




[docs]
def pdfDownloadPath(state):
    """
    Build the path of the directory used for downloaded PDF files.

    The path is the 'pdf' entry directly inside ``state['output-directory']``.
    Nothing is created on disk; only the path string is computed.

    Args:
        state (dict): The current chat status. It must include the key 'output-directory'.

    Returns:
        str: ``state['output-directory']`` joined with 'pdf'.

    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 19, 2024
    return os.path.join(state['output-directory'], 'pdf')



[docs]
def outputFiles(state):
    """
    List the entries of the output directory.

    Returns the names reported by ``os.listdir`` for ``state['output-directory']``.
    The names are not filtered, so sub-directories are included alongside files,
    and they are bare names rather than full paths.

    Args:
        state (dict): The current chat status. It must include the key 'output-directory'.

    Returns:
        list: The entry names found in the output directory.

    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 19, 2024
    return list(os.listdir(state['output-directory']))



[docs]
def makeNamesConsistent(state, files):
    """
    Ensure filenames in the output directory are consistent with the pipeline stage numbering.

    When a pipeline is queued, two passes are made over the output directory:

    1. Every name in `files` that does not start with 'S' (and is not a directory)
       is renamed to ``S<stage>-<name>``, where ``<stage>`` is ``state['queue pointer']``.
    2. Every entry currently in the output directory has any '/' or '\\'
       characters removed from its name.

    Each renamed file is appended to the 'output' list of the current stage
    (``state['queue'][<stage>]['output']``, created on demand) and recorded in
    ``state['process']['steps']``.

    Args:
        state (dict): The current chat status. It must include the keys 'queue',
                           'queue pointer', 'output-directory' and 'process'.
        files (list): A list of filenames (relative to the output directory) to be processed.

    Returns:
        dict: The `state` dictionary. It is returned unchanged when no pipeline
        is queued; otherwise it carries the rename records described above.

    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 19, 2024

    # Nothing to renumber outside of a pipeline. The state is still returned so
    # that callers can always write `state = makeNamesConsistent(state, files)`.
    if len(state['queue']) == 0:
        return state

    log.debugLog('Finding Stage Number of Pipeline', state=state)
    log.debugLog(state['queue'], state=state)
    # NOTE(review): save()/savefig() prefix with `queue pointer + 1`, while this
    # function uses `queue pointer` as-is — confirm the difference is intended.
    IP = int(state['queue pointer'])
    log.debugLog(f"{IP=}", state=state)
    log.debugLog(f"{type(IP)=}", state=state)

    output_dir = state['output-directory']
    renamedFiles = []

    def _rename(old_path, new_path):
        # Rename on disk, then record the change both in the step log and in
        # the output list of the current pipeline stage.
        os.rename(old_path, new_path)
        renamedFiles.append(
            {
                'old-name' : old_path,
                'new-name' : new_path
            }
        )
        # setdefault creates the 'output' list on the stage entry itself;
        # the stage entry must not be replaced.
        state['queue'][IP].setdefault('output', []).append(new_path)

    # Pass 1: add the stage prefix to the new files that do not carry one yet
    for file in files:
        if file.startswith('S'):
            continue
        old_path = os.path.join(output_dir, file)
        if os.path.isdir(old_path):
            continue
        _rename(old_path, os.path.join(output_dir, 'S' + str(IP) + '-' + file))

    # Pass 2: drop path separators from every name in the output directory
    for file in outputFiles(state):
        cleaned = file.replace('/', '').replace('\\', '')
        if cleaned != file:
            _rename(os.path.join(output_dir, file), os.path.join(output_dir, cleaned))

    state['process']['steps'].append(
        {
            'func'  : 'utils.makeNamesConsistent',
            'files' : renamedFiles
        }
    )
    return state



[docs]
def loadFromFile(state):
    """
    Loads data from a file selected by an LLM prompt based on user input.

    The entries of the output directory are offered to the language model, which
    is expected to answer with a ``File: <name>`` line and a ``Fields: <column>``
    line. The answer is matched against the available names by string similarity,
    the best match is read as a delimited text file (tab-separated for '.tsv',
    comma-separated otherwise), and the requested column is returned. When the
    requested column is not present in the file, the language model is asked
    again through `fieldSelectorFromDataFrame`.

    Args:
        state (dict): The current chat status. It must include the keys 'prompt',
                           'llm', 'output-directory' and 'process'.

    Returns:
        tuple: Updated state dictionary and a list of values from the specified field in the file.

    Raises:
        ValueError: If the output directory contains no entries to choose from.

    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 19, 2024
    prompt = state['prompt']
    llm    = state['llm']

    # Get files to choose from
    availableFilesList = outputFiles(state)
    if not availableFilesList:
        # Fail before spending an LLM call: with nothing to choose from, the
        # argmax over similarity scores below could not succeed anyway.
        raise ValueError(
            "No files are available in the output directory: " + str(state['output-directory'])
        )
    availableFiles = '\n'.join(availableFilesList)
    log.debugLog(availableFiles, state=state)

    # Build lang chain
    template = fileChooserTemplate()
    template = template.format(files=availableFiles)
    log.debugLog(template, state=state)
    PROMPT   = PromptTemplate(input_variables=["user_query"], template=template)
    chain    = PROMPT | llm

    # Call chain, recording the elapsed time and the token usage of the call
    state   = log.userOutput(prompt, state=state)
    start_time = time.time()
    with get_openai_callback() as cb:
        responseFull = chain.invoke(prompt)
    response = responseFull.content.strip()
    responseFull = {'content': responseFull}
    responseFull['time'] = time.time() - start_time
    responseFull['call back'] = {
            "Total Tokens": cb.total_tokens,
            "Prompt Tokens": cb.prompt_tokens,
            "Completion Tokens": cb.completion_tokens,
            "Total Cost (USD)": cb.total_cost
    }

    # Regular expressions to extract file and fields
    file_pattern = r"File:\s*(\S+)"
    fields_pattern = r"Fields:\s*(.+)"

    # Search for patterns in the response
    file_match = re.search(file_pattern, response)
    fields_match = re.search(fields_pattern, response)

    # Extract the matched values. If the model ignored the "File:" format, the
    # whole response is used for the similarity match instead of None, which
    # SequenceMatcher cannot compare.
    requestedFile = file_match.group(1) if file_match else response
    fields = fields_match.group(1) if fields_match else None

    # Find the file that is most similar to the extracted file
    scores = [word_similarity(requestedFile, availableFile) for availableFile in availableFilesList]
    file = availableFilesList[np.argmax(scores)]

    log.debugLog('File=' + str(file) + '\n' + 'Fields=' + str(fields), state=state)
    state['process']['steps'].append(
        log.llmCallLog(
            llm          = llm,
            prompt       = PROMPT,
            input        = prompt,
            output       = responseFull,
            parsedOutput = {
                'File'   : file,
                'Fields' : fields
            },
            purpose      = 'Select File'
        )
    )

    # Determine the delimiter based on the file extension
    delimiter = ',' if not file.endswith('.tsv') else '\t'

    # Read the file into a DataFrame
    loadfile = os.path.join(state['output-directory'], file)
    df = pd.read_csv(loadfile, delimiter=delimiter)
    state['process']['steps'].append(
        log.loadFileLog(file      = loadfile,
                        delimiter = delimiter)
    )

    # Ask the model to pick a column when its first answer is not one of them
    if fields not in df.columns:
        state, fields = fieldSelectorFromDataFrame(state, df)

    return state, list(df[fields].values)



[docs]
def fieldSelectorFromDataFrame(state, df):
    """
    Selects a field from a DataFrame using a language model prompt.

    The column labels of `df` are inserted into the field-chooser prompt template,
    the language model is invoked with the user's prompt, and the text after the
    first '=' in the response is taken as the selected field. When the response
    contains no '=', the whole (stripped) response is used as the field.

    Args:
        state (dict): The current chat status. It must include the keys
                           'llm', 'prompt', and 'process'.
        df (pandas.DataFrame): The DataFrame from which a field will be selected.

    Returns:
        tuple: Updated state dictionary and the selected field as a string.

    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 19, 2024
    llm      = state['llm']
    prompt   = state['prompt']
    template = fieldChooserTemplate()
    # Column labels are not necessarily strings (e.g. integer headers)
    template = template.format(columns=', '.join(map(str, df.columns)))
    PROMPT   = PromptTemplate(input_variables=["user_query"], template=template)
    chain    = PROMPT | llm

    # Call chain
    response = chain.invoke(prompt).content.strip()

    # The reply is expected to look like "<label> = <field>"; fall back to the
    # raw reply rather than failing when the model leaves out the '='.
    parts = response.split('=')
    fields = parts[1].strip() if len(parts) > 1 else response

    state['process']['steps'].append(
        log.llmCallLog(
            llm          = llm,
            prompt       = PROMPT,
            input        = prompt,
            output       = response,
            parsedOutput = {
                'Fields' : fields
            },
            purpose      = 'Select Field'
        )
    )

    log.debugLog('field identifier response=\n'+fields, state=state)
    return state, fields



[docs]
def word_similarity(word1, word2):
    """
    Measure how alike two words are with `difflib.SequenceMatcher`.

    The score is the value of ``SequenceMatcher.ratio()`` for the two inputs,
    computed without a junk filter.

    :param word1: The first word to compare.
    :type word1: str
    :param word2: The second word to compare.
    :type word2: str

    :return: A similarity score between 0 and 1. A value of 1.0 means the words
             are identical, while 0.0 means they have nothing in common.
    :rtype: (float)

    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 23, 2024
    matcher = difflib.SequenceMatcher(isjunk=None, a=word1, b=word2)
    similarity = matcher.ratio()
    return similarity



[docs]
def outputFromPriorStep(state, step, values=None):
    """
    Retrieve the output from a prior step in the pipeline.

    .. warning:: We may be removing this function soon.

    The output directory is searched for an entry named ``S<step>...``, where the
    character following the step number is not another digit (so step 1 does not
    pick up the files of step 10). If several entries match, the last one
    reported by the directory listing is used. A matching CSV file is loaded
    into a DataFrame, optionally restricted to the columns in `values`.

    Args:
        state (dict): The current chat status, including the output directory.
        step (str or int): The step number identifying the output file.
        values (list, optional): A list of column names to select from the DataFrame. If None, all columns are returned.

    Returns:
        pandas.DataFrame or None: The data from the output file of the specified step
        (only the requested columns when `values` is given), or None when the
        matching file is not a CSV file.

    Raises:
        FileNotFoundError: If no entry in the output directory belongs to the specified step.

    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 19, 2024
    log.debugLog(state, state=state)
    log.debugLog(step, state=state)

    # str() lets callers pass the step as an int as well as a string
    prefix = 'S' + str(step)
    file = None
    for filename in os.listdir(state['output-directory']):
        if not filename.startswith(prefix):
            continue
        # A digit right after the prefix means a different step (S1 vs S10-...)
        if filename[len(prefix):len(prefix) + 1].isdigit():
            continue
        file = filename
    state = log.userOutput(file, state=state)

    if file is None:
        raise FileNotFoundError(
            f"No output file for step {step} was found in {state['output-directory']}."
        )
    if not file.endswith('.csv'):
        # Only CSV outputs can be loaded; report it instead of failing on an
        # unassigned DataFrame.
        log.debugLog(f"The output of step {step} ({file}) is not a CSV file.", state=state)
        return None

    file_path = os.path.join(state['output-directory'], file)
    df = pd.read_csv(file_path)
    state = log.userOutput(df, state=state)
    if values is not None:
        df = df[values]
    return df



[docs]
def compile_latex_to_pdf(state, tex_file):
    """
    Compile a LaTeX (.tex) file into a PDF using pdflatex.

    pdflatex is run on ``<output-directory>/<tex_file>`` with the output directory
    as its ``-output-directory``. It is run in non-interactive mode so that a TeX
    error ends the run with a failure status instead of waiting for keyboard
    input. The outcome (success or failure) is recorded in
    ``state['process']['steps']``.

    Args:
        state (dict): The current chat status, including the output directory.
        tex_file (str): The filename of the LaTeX file (including the .tex extension) to compile.

    Returns:
        dict: Updated state dictionary after attempting to compile the LaTeX file.

    Raises:
        FileNotFoundError: If the specified LaTeX file does not exist. The same
            exception type propagates from `subprocess.run` when the pdflatex
            executable cannot be found.

    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 23, 2024
    output_dir = state['output-directory']
    tex_file_path = os.path.join(output_dir, tex_file)

    # Ensure the file exists
    if not os.path.isfile(tex_file_path):
        raise FileNotFoundError(f"The file {tex_file_path} does not exist.")

    # Run the pdflatex command with the specified output directory
    command = [
        'pdflatex',
        '-interaction=nonstopmode',  # never stop to prompt the user on a TeX error
        '-output-directory', output_dir,
        tex_file_path,
    ]
    try:
        subprocess.run(command, check=True)
        log.debugLog(f"PDF generated successfully in {state['output-directory']}.", state=state)
        state['process']['steps'].append(
            {
                'func' : 'utils.compile_latex_to_pdf',
                'what' : 'tried to compile latex to a pdf'
            }
        )
    except subprocess.CalledProcessError as e:
        # A non-zero exit status is logged and recorded, not re-raised
        log.debugLog(f"An error occurred: {e}", state=state)
        state['process']['steps'].append(
            {
                'func' : 'utils.compile_latex_to_pdf',
                'what' : 'failed to compile latex to a pdf'
            }
        )
    return state



[docs]
def add_output_file_path_to_string(string, state):
    """
    Prefix the names of previously generated files in a string with the output directory.

    For every entry of the output directory whose name occurs in `string`, the
    name is replaced by ``<output-directory>/<name>`` — unless that full path
    already occurs somewhere in the string, in which case the entry is skipped.

    Parameters:
        string (str): The input string to be modified.
        state (dict): The current chat status. It must include the key
                           'output-directory'.

    Returns:
        str: The modified string with the output directory added to the file names.
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 30, 2024

    # Names of the generated files and the directory they live in
    generated_files = outputFiles(state)
    append_path = state['output-directory']

    # NOTE(review): matching is plain substring search and every occurrence is
    # replaced, so a name contained in a longer name is rewritten too.
    for file in generated_files:
        if file not in string:
            continue
        fileWpath = os.path.join(append_path, file)
        if fileWpath in string:
            # The full path is already present somewhere in the string
            continue
        string = string.replace(file, fileWpath)
        log.debugLog("Replacing: " + file + ' with ' + fileWpath, state=state)
        log.debugLog("New String: " + str(string), state=state)
    return string



[docs]
def load_file_to_dataframe(filename):
    """
    Read a CSV or TSV file into a Pandas DataFrame, chosen by file extension.

    The extension is compared case-insensitively: '.csv' files are read with the
    default pandas settings and '.tsv' files with a tab delimiter.

    Args:
        filename (str): The path to the file to load.

    Returns:
        pd.DataFrame or None: The loaded DataFrame, or None if the file extension is not supported.

    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 23, 2024

    # Extra keyword arguments for pd.read_csv, per supported extension
    read_options = {
        '.csv': {},
        '.tsv': {'delimiter': '\t'},
    }

    extension = os.path.splitext(filename)[1].lower()
    if extension not in read_options:
        return None
    return pd.read_csv(filename, **read_options[extension])



[docs]
def find_integer_in_string(text):
    """
    Return the first integer that appears in a string.

    The first run of consecutive digits in `text` is converted to an int.
    A sign in front of the digits is not part of the match.

    Args:
        text (str): The string to search.

    Returns:
        int or None: The first run of digits as an integer, or None if `text` contains no digit.

    """
    first_digits = re.search(r'\d+', text)
    return None if first_digits is None else int(first_digits.group(0))



[docs]
def delete_dirs_without_log(agent):
    """
    Delete the first-level sub-directories of the log directory that contain no 'log.json'.

    The log directory is read from ``agent.state['config']['log_path']``. Each of
    its immediate sub-directories that has no 'log.json' file directly inside it
    is removed recursively, and a message is printed for every removal. Plain
    files in the log directory are left alone. When 'log_path' is not configured
    (missing, None or empty), nothing is done.

    Args:
        agent: An object whose ``state['config']`` dictionary may hold the key 'log_path'.

    """
    directory = agent.state['config'].get('log_path')
    if not directory:
        # Without this guard os.listdir(None) would enumerate the current
        # working directory instead of a log directory.
        return

    # List only first-level subdirectories
    for subdir in os.listdir(directory):
        subdir_path = os.path.join(directory, subdir)

        # Plain files are not session directories
        if not os.path.isdir(subdir_path):
            continue

        # If log.json does not exist in the subdirectory, delete the subdirectory
        log_file_path = os.path.join(subdir_path, 'log.json')
        if not os.path.exists(log_file_path):
            shutil.rmtree(subdir_path)  # Recursively delete directory and its contents
            print(f"Deleted directory: {subdir_path}")





[docs]
def strip_root_path(url, root_path):
    """
    Strip the root path from the path component of a URL.

    Both arguments are parsed with `urlparse`. When the path of `url` starts with
    the path of `root_path`, that prefix is removed from the path component only;
    the scheme, host, query and fragment of `url` are kept. The comparison is a
    plain string-prefix test, so it is not restricted to whole path segments.

    Args:
        url (str): The URL or path to shorten.
        root_path (str): The URL or path whose path component is the prefix to remove.

    Returns:
        str: The URL with the root path removed, or `url` unchanged when its path
        does not start with the root path.

    """
    parsed_url = urlparse(url)
    root = urlparse(root_path).path

    if not parsed_url.path.startswith(root):
        return url

    # Only the path component is edited. A text replace on the whole URL could
    # hit an earlier occurrence of the root, e.g. inside '//api.example.com'.
    remaining_path = parsed_url.path[len(root):]
    return parsed_url._replace(path=remaining_path).geturl()