"""
This module provides a set of utility functions designed to streamline common tasks related to file management,
data handling, directory operations, and more across the core and tool modules.
Scope
=====
The goal of this module is to offer reusable, general-purpose utilities that simplify routine tasks that interface the LLM with other aspects of the
code. These tasks include saving and loading files, ensuring directories exist, generating standardized file paths,
and more. Each function is designed to abstract repetitive operations and enhance code clarity, maintainability,
and reliability. The functions in this module can be imported as needed when building different aspects of the BRAD framework.
Available Methods
=================
This module contains the following methods:
"""
import re
import os
import time
import numpy as np
import pandas as pd
import subprocess
import difflib
import matplotlib.pyplot as plt
import shutil
from urllib.parse import urlparse
from langchain import PromptTemplate, LLMChain
from langchain_community.callbacks import get_openai_callback
from BRAD import log
from BRAD.promptTemplates import fileChooserTemplate, fieldChooserTemplate
[docs]
def save(state, data, name):
    """
    Save data to the output directory, with an optional stage number prefix.

    The file is written to ``state['output-directory']``. When ``state['queue']``
    is non-empty (i.e. a pipeline is running) the file name is prefixed with
    ``S<stage>-``, where ``<stage>`` is ``state['queue pointer'] + 1``.

    Args:
        state (dict): The current chat status. Must contain the keys 'queue',
            'output-directory' and 'process' (and 'queue pointer' when the
            queue is non-empty).
        data (pd.DataFrame or str): The data to be saved. A DataFrame is written
            as CSV (without the index) regardless of the file extension; a
            string is written verbatim, but only to a file ending in ``.tex``.
        name (str): The name of the output file.

    Returns:
        dict: The same `state`, with a 'utils.save' entry appended to
        ``state['process']['steps']`` recording the new file path.

    Raises:
        ValueError: If `data` is neither a DataFrame nor a string destined for
            a ``.tex`` file. The check happens before any file is opened, so no
            empty file is left behind.
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 19, 2024

    # If this is part of a pipeline, then add the stage number to the file name
    if len(state['queue']) != 0:
        stage_num = state['queue pointer'] + 1
        name = 'S' + str(stage_num) + '-' + name
    output_path = os.path.join(state['output-directory'], name)

    if isinstance(data, pd.DataFrame):
        data.to_csv(output_path, index=False)
    elif isinstance(data, str) and output_path.endswith('.tex'):
        # Validate the type *before* opening so a bad call cannot create an
        # empty .tex file and then fail with a TypeError inside write().
        with open(output_path, 'w') as file:
            file.write(data)
    else:
        raise ValueError("Unsupported data type or file extension. Use a DataFrame for CSV or a string for .tex files.")

    log.debugLog('The information has been saved to: ' + output_path, state=state)
    state['process']['steps'].append(
        {
            'func': 'utils.save',
            'new file': output_path
        }
    )
    return state
[docs]
def savefig(state, ax, name):
    """
    Write the current matplotlib figure into the image sub-directory of the output directory.

    The target path is ``<output-directory>/<image-path-extension>/<name>``, where the
    image sub-directory comes from ``state['config']['image-path-extension']``. When a
    pipeline is running (``state['queue']`` is non-empty) the file name is prefixed with
    ``S<stage>-``, using ``state['queue pointer'] + 1`` as the stage number.

    Note:
        The `ax` argument is accepted but not used: the figure is written with
        ``plt.savefig``, i.e. whatever figure pyplot currently considers active.

    Args:
        state (dict): The current chat status, including 'queue', 'output-directory',
            'config' and 'process'.
        ax (matplotlib.axes.Axes): The axis the caller plotted on (currently unused).
        name (str): The name of the output file.

    Returns:
        dict: The same `state`, with a 'utils.savefig' entry appended to
        ``state['process']['steps']``.
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 19, 2024
    log.debugLog("SAVEFIG", state=state)

    pipeline_running = len(state['queue']) != 0
    if pipeline_running:
        stage_prefix = 'S' + str(state['queue pointer'] + 1) + '-'
        name = stage_prefix + name

    image_dir = os.path.join(state['output-directory'], state['config']['image-path-extension'])
    output_path = os.path.join(image_dir, name)

    # Make sure the image folder exists before pyplot tries to write into it.
    ensure_directory_exists(output_path, state)
    plt.savefig(output_path)
    log.debugLog('The image was saved to: ' + output_path, state=state)

    step_record = {
        'func': 'utils.savefig',
        'new file': output_path
    }
    state['process']['steps'].append(step_record)
    return state
[docs]
def ensure_directory_exists(file_path, state):
    """
    Ensure that the directory for a given file path exists, creating it if necessary.

    The directory part of `file_path` is taken with ``os.path.dirname``. If it does
    not exist it is created (including intermediate directories). A debug message
    is logged saying whether the directory was created or already existed.

    Args:
        file_path (str): The full file path whose parent directory is needed.
        state (dict): The current chat status; only forwarded to ``log.debugLog``.

    Returns:
        None
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 23, 2024
    directory_path = os.path.dirname(file_path)

    if not directory_path:
        # A bare file name has no directory component: it refers to the current
        # working directory, and os.makedirs('') would raise FileNotFoundError.
        log.debugLog(f"No directory component in '{file_path}'; nothing to create.", state=state)
        return

    if os.path.exists(directory_path):
        log.debugLog(f"Directory '{directory_path}' already exists.", state=state)
        return

    # exist_ok=True tolerates another process creating the directory between
    # the existence check above and this call.
    os.makedirs(directory_path, exist_ok=True)
    log.debugLog(f"Directory '{directory_path}' created.", state=state)
[docs]
def pdfDownloadPath(state):
    """
    Build the path of the folder used for downloaded PDF files.

    The folder is the ``pdf`` sub-directory of ``state['output-directory']``.
    The path is only computed; nothing is created on disk.

    Args:
        state (dict): The current chat status. It must include the key
            'output-directory'.

    Returns:
        str: ``<output-directory>/pdf`` joined with the platform separator.
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 19, 2024
    return os.path.join(state['output-directory'], 'pdf')
[docs]
def outputFiles(state):
    """
    List the entries of the output directory.

    The names are returned exactly as ``os.listdir`` reports them for
    ``state['output-directory']``: first level only, in arbitrary order, and
    including sub-directories as well as regular files.

    Args:
        state (dict): The current chat status. It must include the key
            'output-directory'.

    Returns:
        list: The entry names (not full paths) found in the output directory.
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 19, 2024
    directory = state['output-directory']
    return list(os.listdir(directory))
[docs]
def makeNamesConsistent(state, files):
    """
    Ensure filenames in the output directory are consistent with the pipeline stage numbering.

    When a pipeline is running (``state['queue']`` is non-empty) two passes are made:

    1. Every name in `files` that does not start with 'S' and is not a directory is
       renamed to ``S<IP>-<name>``, where ``<IP>`` is ``int(state['queue pointer'])``.
    2. Every entry currently in the output directory has any '/' or '\\' characters
       removed from its name.

    Each renamed file is appended to ``state['queue'][IP]['output']`` (the list is
    created on first use) and the old/new name pairs are logged as one
    'utils.makeNamesConsistent' step.

    Args:
        state (dict): The current chat status. It must include the keys 'queue',
            'queue pointer', 'output-directory' and 'process'.
        files (list): Names (relative to the output directory) to be processed.

    Returns:
        dict: The `state`. It is returned unchanged when no pipeline is running.
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 19, 2024

    # Dev. Comments:
    # -------------------
    # Issues:
    # - It is not clear why there are 2 for loops that rename files
    # - NOTE(review): the stage number here is the raw queue pointer, whereas
    #   `save`/`savefig` use queue pointer + 1 -- confirm which is intended.
    if len(state['queue']) == 0:
        # Not part of a pipeline: nothing to rename. Return the state (rather
        # than None) so `state = makeNamesConsistent(state, files)` stays valid.
        return state

    log.debugLog('Finding Stage Number of Pipeline', state=state)
    log.debugLog(state['queue'], state=state)
    IP = int(state['queue pointer'])
    log.debugLog(f"{IP=}", state=state)
    log.debugLog(f"{type(IP)=}", state=state)

    output_dir = state['output-directory']
    renamedFiles = []

    def _rename_and_record(old_path, new_path):
        # Rename on disk, remember the pair, and register the new file as an
        # output of the current stage without clobbering the stage entry.
        renamedFiles.append(
            {
                'old-name': old_path,
                'new-name': new_path
            }
        )
        os.rename(old_path, new_path)
        state['queue'][IP].setdefault('output', []).append(new_path)

    # Pass 1: add the stage prefix to files that do not have one yet
    for file in files:
        if file.startswith('S'):
            continue
        old_path = os.path.join(output_dir, file)
        if os.path.isdir(old_path):
            continue
        _rename_and_record(old_path, os.path.join(output_dir, 'S' + str(IP) + '-' + file))

    # Pass 2: strip path separators out of the names in the output directory
    for file in outputFiles(state):
        old_path = os.path.join(output_dir, file)
        new_path = os.path.join(output_dir, file.replace('/', '').replace('\\', ''))
        if old_path != new_path:
            _rename_and_record(old_path, new_path)

    state['process']['steps'].append(
        {
            'func': 'utils.makeNamesConsistent',
            'files': renamedFiles
        }
    )
    return state
[docs]
def loadFromFile(state):
    """
    Loads data from a file selected by an LLM prompt based on user input.

    The LLM is shown the names of the entries in the output directory and asked
    (via ``fileChooserTemplate``) which file and field the user's prompt refers to.
    The reply is parsed for ``File: <name>`` and ``Fields: <text>``. The output
    file whose name is most similar to the requested one is then read as a
    delimited text file (tab-separated for ``.tsv``, comma-separated otherwise).
    If the parsed field is not a column of that file, the LLM is asked again
    through ``fieldSelectorFromDataFrame``.

    Args:
        state (dict): The current chat status. It must include the keys 'prompt',
            'llm', 'output-directory' and 'process'.

    Returns:
        tuple: Updated state dictionary and a list of values from the selected
        column of the file.

    Raises:
        ValueError: If the output directory contains no entries to choose from.
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 19, 2024
    prompt = state['prompt']
    llm = state['llm']

    # Get files to choose from
    availableFilesList = outputFiles(state)
    if not availableFilesList:
        # Fail with a clear message before spending an LLM call; previously this
        # surfaced later as an opaque "argmax of an empty sequence" ValueError.
        raise ValueError(
            "No files are available in the output directory: " + str(state['output-directory'])
        )
    availableFiles = '\n'.join(availableFilesList)
    log.debugLog(availableFiles, state=state)

    # Build lang chain
    template = fileChooserTemplate()
    template = template.format(files=availableFiles)
    log.debugLog(template, state=state)
    PROMPT = PromptTemplate(input_variables=["user_query"], template=template)
    chain = PROMPT | llm

    # Call chain, recording wall-clock time and token usage
    state = log.userOutput(prompt, state=state)
    start_time = time.time()
    with get_openai_callback() as cb:
        responseFull = chain.invoke(prompt)
    response = responseFull.content.strip()
    responseFull = {'content': responseFull}
    responseFull['time'] = time.time() - start_time
    responseFull['call back'] = {
        "Total Tokens": cb.total_tokens,
        "Prompt Tokens": cb.prompt_tokens,
        "Completion Tokens": cb.completion_tokens,
        "Total Cost (USD)": cb.total_cost
    }

    # Extract the requested file and fields from the response
    file_match = re.search(r"File:\s*(\S+)", response)
    fields_match = re.search(r"Fields:\s*(.+)", response)
    # If the reply did not follow the "File: ..." format, match against the
    # whole reply instead of passing None to the similarity function.
    requested_file = file_match.group(1) if file_match else response
    fields = fields_match.group(1) if fields_match else None

    # Pick the available file most similar to the requested name
    # (the first one wins on ties).
    file = max(
        availableFilesList,
        key=lambda availableFile: word_similarity(requested_file, availableFile)
    )
    log.debugLog('File=' + str(file) + '\n' + 'Fields=' + str(fields), state=state)
    state['process']['steps'].append(
        log.llmCallLog(
            llm=llm,
            prompt=PROMPT,
            input=prompt,
            output=responseFull,
            parsedOutput={
                'File': file,
                'Fields': fields
            },
            purpose='Select File'
        )
    )

    # Determine the delimiter based on the file extension
    delimiter = '\t' if file.endswith('.tsv') else ','

    # Read the file into a DataFrame
    loadfile = os.path.join(state['output-directory'], file)
    df = pd.read_csv(loadfile, delimiter=delimiter)
    state['process']['steps'].append(
        log.loadFileLog(file=loadfile, delimiter=delimiter)
    )

    # Fall back to a dedicated LLM call when the parsed field is not a column
    if fields not in df.columns:
        state, fields = fieldSelectorFromDataFrame(state, df)
    return state, list(df[fields].values)
[docs]
def fieldSelectorFromDataFrame(state, df):
    """
    Selects a field from a DataFrame using a language model prompt.

    The column names of `df` are inserted into ``fieldChooserTemplate`` and the
    LLM is invoked with the user's prompt. The reply is expected to look like
    ``<label> = <column>``; the text between the first and second '=' is taken as
    the field. If the reply contains no '=', the whole (stripped) reply is used.

    Args:
        state (dict): The current chat status. It must include the keys 'llm',
            'prompt', and 'process'.
        df (pandas.DataFrame): The DataFrame from which a field will be selected.

    Returns:
        tuple: Updated state dictionary and the selected field as a string. The
        field is not validated against ``df.columns`` here.
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 19, 2024
    llm = state['llm']
    prompt = state['prompt']
    template = fieldChooserTemplate()
    template = template.format(columns=', '.join(list(df.columns)))
    PROMPT = PromptTemplate(input_variables=["user_query"], template=template)
    chain = PROMPT | llm

    # Call chain
    response = chain.invoke(prompt).content.strip()

    # Tolerate replies without '=' instead of raising IndexError
    parts = response.split('=')
    fields = (parts[1] if len(parts) > 1 else response).strip()

    state['process']['steps'].append(
        log.llmCallLog(
            llm=llm,
            prompt=PROMPT,
            input=prompt,
            output=response,
            parsedOutput={
                'Fields': fields
            },
            purpose='Select Field'
        )
    )
    log.debugLog('field identifier response=\n' + fields, state=state)
    return state, fields
[docs]
def word_similarity(word1, word2):
    """
    Score how alike two words are with ``difflib.SequenceMatcher``.

    The score is ``SequenceMatcher(None, word1, word2).ratio()``, i.e. twice the
    number of matching characters divided by the combined length of both words.

    :param word1: The first word to compare.
    :type word1: str
    :param word2: The second word to compare.
    :type word2: str

    :return: A float between 0 and 1; 1.0 means the words are identical and
             0.0 means they have nothing in common.
    :rtype: float
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 23, 2024
    matcher = difflib.SequenceMatcher(None, word1, word2)
    return matcher.ratio()
[docs]
def outputFromPriorStep(state, step, values=None):
    """
    Retrieve the output from a prior step in the pipeline.

    .. warning:: We may be removing this function soon.

    The output directory is searched for files named ``S<step>-...``. If none
    exist, any file starting with ``S<step>`` is accepted instead. Among the
    matches, the last CSV file (in ``os.listdir`` order) is loaded into a
    DataFrame. Optionally, specific columns can be selected from it.

    Args:
        state (dict): The current chat status, including the output directory.
        step (str or int): The step number identifying the output file.
        values (list, optional): Column names to select from the DataFrame.
            If None, all columns are returned.

    Returns:
        pandas.DataFrame: The data from the step's CSV output, restricted to
        `values` when given.

    Raises:
        FileNotFoundError: If no CSV output file exists for the requested step.
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 19, 2024
    log.debugLog(state, state=state)
    log.debugLog(step, state=state)

    prefix = 'S' + str(step)
    filenames = os.listdir(state['output-directory'])

    # Prefer the exact "S<step>-" form so that step 1 does not pick up the
    # outputs of steps 10, 11, ...; fall back to the looser prefix match.
    matches = [name for name in filenames if name.startswith(prefix + '-')]
    if not matches:
        matches = [name for name in filenames if name.startswith(prefix)]

    # Only CSV files can be loaded here; ignore other outputs of the same step.
    csv_matches = [name for name in matches if name.endswith('.csv')]
    if not csv_matches:
        raise FileNotFoundError(
            f"No CSV output file found for step {step} in {state['output-directory']}."
        )
    file = csv_matches[-1]

    state = log.userOutput(file, state=state)
    file_path = os.path.join(state['output-directory'], file)
    df = pd.read_csv(file_path)
    state = log.userOutput(df, state=state)
    if values is not None:
        df = df[values]
    return df
[docs]
def compile_latex_to_pdf(state, tex_file):
    """
    Compile a LaTeX (.tex) file into a PDF using pdflatex.

    ``pdflatex`` is run on ``<output-directory>/<tex_file>`` with the output
    directory as its ``-output-directory``. The compiler is run in
    ``nonstopmode`` so that a LaTeX error ends the run with a non-zero exit
    status instead of waiting for keyboard input. A compile failure is logged
    and recorded in ``state['process']['steps']``; it is not re-raised.

    Args:
        state (dict): The current chat status, including 'output-directory'
            and 'process'.
        tex_file (str): The filename of the LaTeX file (including the .tex
            extension), relative to the output directory.

    Returns:
        dict: The state, with one 'utils.compile_latex_to_pdf' step appended
        describing whether compilation succeeded or failed.

    Raises:
        FileNotFoundError: If the specified LaTeX file does not exist. The same
            exception type is raised by ``subprocess.run`` when the ``pdflatex``
            executable itself cannot be found.
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 23, 2024
    output_dir = state['output-directory']
    tex_file_path = os.path.join(output_dir, tex_file)

    # Ensure the file exists
    if not os.path.isfile(tex_file_path):
        raise FileNotFoundError(f"The file {tex_file_path} does not exist.")

    # Run the pdflatex command with the specified output directory
    try:
        subprocess.run(
            [
                'pdflatex',
                '-interaction=nonstopmode',  # never block on a TeX error prompt
                '-output-directory', output_dir,
                tex_file_path,
            ],
            check=True
        )
    except subprocess.CalledProcessError as e:
        log.debugLog(f"An error occurred: {e}", state=state)
        state['process']['steps'].append(
            {
                'func': 'utils.compile_latex_to_pdf',
                'what': 'failed to compile latex to a pdf'
            }
        )
    else:
        log.debugLog(f"PDF generated successfully in {output_dir}.", state=state)
        state['process']['steps'].append(
            {
                'func': 'utils.compile_latex_to_pdf',
                'what': 'tried to compile latex to a pdf'
            }
        )
    return state
[docs]
def add_output_file_path_to_string(string, state):
    """
    Prefix the names of previously generated files in a string with the output directory.

    Every entry of the output directory is looked up in `string`. An occurrence
    is rewritten to ``<output-directory>/<name>`` unless

    * it is already immediately preceded by the output directory path, or
    * it is immediately preceded by a file-name character (letter, digit, '_',
      '.' or '-'), which means it is only the tail of a longer name.

    Longer names are handled first so that a name contained in another file's
    name cannot corrupt it.

    Parameters:
        string (str): The input string to be modified.
        state (dict): The current chat status. It must include the key
            'output-directory' (and whatever ``log.debugLog`` needs).

    Returns:
        str: The modified string with the output directory inserted where needed.
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 30, 2024

    # Retrieve the list of generated files and the output path
    generated_files = outputFiles(state)  # list of entry names in the output directory
    append_path = state['output-directory']

    # The output directory followed by a path separator, e.g. "out/"
    path_prefix = os.path.join(append_path, '')

    # With an empty output directory the qualified name equals the bare name,
    # so there is nothing to insert.
    if path_prefix:
        not_already_qualified = r'(?<!' + re.escape(path_prefix) + r')'
        not_inside_longer_name = r'(?<![\w.\-])'
        for file in sorted(generated_files, key=len, reverse=True):
            if file not in string:
                continue
            fileWpath = os.path.join(append_path, file)
            pattern = not_already_qualified + not_inside_longer_name + re.escape(file)
            # A callable replacement keeps backslashes in Windows paths literal.
            string, n_replaced = re.subn(pattern, lambda _match, path=fileWpath: path, string)
            if n_replaced:
                log.debugLog("Replacing: " + file + ' with ' + fileWpath, state=state)

    log.debugLog("New String: " + str(string), state=state)
    return string
[docs]
def load_file_to_dataframe(filename):
    """
    Read a CSV or TSV file into a Pandas DataFrame, chosen by file extension.

    The extension is compared case-insensitively: ``.csv`` files are read with
    the default comma delimiter and ``.tsv`` files with a tab delimiter.

    Parameters
    ----------
    filename (str): The path to the file to load.

    Returns
    -------
    pd.DataFrame or None: The loaded DataFrame, or None when the extension is
    neither ``.csv`` nor ``.tsv`` (the file is not opened in that case).
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 23, 2024
    extension = os.path.splitext(filename)[1].lower()

    if extension == '.csv':
        return pd.read_csv(filename)
    if extension == '.tsv':
        return pd.read_csv(filename, delimiter='\t')

    # Unsupported extension
    return None
[docs]
def find_integer_in_string(text):
    """
    Return the first run of digits in `text` as an integer.

    Only the digits themselves are considered, so a leading sign is ignored
    (``"-5"`` yields ``5``) and leading zeros are dropped (``"007"`` yields ``7``).

    Args:
        text (str): The string to search.

    Returns:
        int or None: The first integer found, or None if `text` has no digits.
    """
    first_digits = re.search(r'\d+', text)
    return int(first_digits.group(0)) if first_digits else None
[docs]
def delete_dirs_without_log(agent):
    """
    Delete first-level sub-directories of the log directory that contain no ``log.json``.

    The log directory is read from ``agent.state['config']['log_path']``. Each
    direct sub-directory that lacks a ``log.json`` file is removed recursively
    and its path is printed. Regular files in the log directory are left alone.
    If 'log_path' is not configured or is not an existing directory, nothing is
    done.

    Args:
        agent: An object exposing ``state['config']`` as a dict.

    Returns:
        None
    """
    directory = agent.state['config'].get('log_path')

    # Without this guard a missing 'log_path' would make os.listdir(None)
    # enumerate the current working directory and then fail in os.path.join.
    if not directory or not os.path.isdir(directory):
        return

    # List only first-level subdirectories
    for subdir in os.listdir(directory):
        subdir_path = os.path.join(directory, subdir)
        if not os.path.isdir(subdir_path):
            continue
        if os.path.exists(os.path.join(subdir_path, 'log.json')):
            continue
        shutil.rmtree(subdir_path)  # Recursively delete directory and its contents
        print(f"Deleted directory: {subdir_path}")
[docs]
def strip_root_path(url, root_path):
    """
    Strip the root path from the start of a URL's path component.

    Both arguments are parsed with ``urlparse``; only their path components are
    compared. When the URL's path begins with the root's path, that leading part
    is removed from the path only, so the scheme, host, query and fragment are
    left untouched. Otherwise the URL is returned exactly as given.

    Args:
        url (str): The URL (or bare path) to shorten.
        root_path (str): The root path, or a URL whose path is the root.

    Returns:
        str: The URL with the root path removed from the front of its path, or
        the original `url` if its path does not start with the root path.
    """
    parsed_url = urlparse(url)
    root = urlparse(root_path).path
    if not parsed_url.path.startswith(root):
        return url
    # Rebuild from the parsed parts instead of str.replace on the full URL,
    # which would hit the first textual match anywhere -- e.g. inside
    # "//api.example.com" for root "/api", or the "/" of "http://" for root "/".
    remaining_path = parsed_url.path[len(root):]
    return parsed_url._replace(path=remaining_path).geturl()