"""
This module implements functions to facilitate Retrieval Augmented Generation (RAG), which combines the strengths of document retrieval and language model generation to enhance the user's experience with rich, contextually relevant responses.
Key Functions
=============
1. queryDocs:
Queries documents based on a user prompt and updates the chat status with the results. This function handles the retrieval of relevant documents from a vector database, applies contextual compression, reranks the documents if required, and invokes the language model to generate a response based on the retrieved documents. It also logs the interaction and displays the sources of the information.
2. create_database:
Constructs a database of papers that can be used by the RAG pipeline in BRAD. This method requires a single directory of papers, books, or other pdf documents. This method should be used directly, outside of and prior to constructing an instance of the `Agent` class. Once a database is constructed, documents can be added or removed, and the database will persist on the local disk so that it only needs to be constructed once.
There are several supporting methods as well.
Available Methods
=================
This module contains the following methods:
"""
import pandas as pd
import numpy as np
import chromadb
import time
import subprocess
import os
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
from langchain.output_parsers import CommaSeparatedListOutputParser
from semantic_router.layer import RouteLayer
from langchain.chains import ConversationChain
from langchain.prompts import PromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain.docstore.document import Document
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
SentenceTransformerEmbeddings,
)
from sklearn.metrics.pairwise import cosine_similarity
from langchain_text_splitters import CharacterTextSplitter
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_community.callbacks import get_openai_callback
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import logging
from BRAD.promptTemplates import history_chat_template, summarize_document_template, get_default_context
#Extraction
import re
from BRAD import utils
from BRAD import log
from BRAD import justchat
# History:
# 2024-09-22: Changing the chains to return information regarding API usage
# such as tokens, time, etc.
def queryDocs(state):
    """
    Answers the user prompt with Retrieval Augmented Generation and updates the chat status.

    When ``state['databases']['RAG']`` is set, documents are fetched with ``retrieval``,
    optionally reranked, enriched and compressed according to ``state['config']['RAG']``,
    prefixed with their source file name, and handed to a "stuff" question-answering chain.
    The answer and the de-duplicated source file names are shown to the user and the LLM
    call is appended to ``state['process']['steps']``. When no database is configured the
    prompt is delegated to ``justchat.llm_only``.

    :param state: A dictionary containing the current chat status, including the prompt, LLM, vector database, and configuration.
    :type state: dict
    :raises KeyError: If required keys are not found in the state dictionary.
    :raises AttributeError: If methods on the vector database or LLM objects are called incorrectly.
    :return: The updated chat status dictionary with the query results.
    :rtype: dict
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: May 20, 2024

    # Developer Comments:
    # -------------------
    # This function performs Retrieval Augmented Generation. A separate method
    # does the retrieval, and this is primarily focused on the generation or
    # llm calling
    #
    # History:
    # 2024-07-21: Added a new feature to change the doc.page_content to include
    #             the source
    # 2024-10-16: JP changes made to make the logs of this suitable to the GUI
    llm = state['llm']                    # get the llm
    prompt = state['prompt']              # get the user prompt
    vectordb = state['databases']['RAG']  # get the vector database

    # Without a vector database there is nothing to retrieve: plain LLM chat.
    if vectordb is None:
        return justchat.llm_only(state)

    # solo, multiquery, similarity, and mmr retrieval
    state, docs = retrieval(state)
    rag_config = state['config']['RAG']

    # rerank the documents according to pagerank algorithm
    # NOTE(review): pagerank_rerank is not defined in the visible part of this
    # module; enabling 'rerank' relies on it being provided elsewhere.
    if rag_config['rerank']:
        docs = pagerank_rerank(docs, state)

    # document enrichment
    if rag_config['documentEnrichment']:
        docs = documentEnrichment(docs, state)

    # contextual compression of the documents
    if rag_config['contextual_compression']:
        docs = contextual_compression(docs, state)

    # Prefix every chunk with its file name so the LLM can attribute content.
    # The documents are modified in place.
    for doc in docs:
        short_source = os.path.basename(str(doc.metadata.get('source')))
        doc.page_content = "Source: " + short_source + "\nContent: " + doc.page_content

    # build chain
    chain = load_qa_chain(llm, chain_type="stuff", verbose=state['config']['debug'])

    # invoke the chain
    start_time = time.time()
    with get_openai_callback() as cb:
        response = chain({"input_documents": docs, "question": prompt})
    response['metadata'] = {
        # NOTE(review): this stores `response` inside itself (a reference cycle);
        # confirm whether a snapshot of the chain output was intended.
        'content': response,
        'time': time.time() - start_time,
        'call back': {
            "Total Tokens": cb.total_tokens,
            "Prompt Tokens": cb.prompt_tokens,
            "Completion Tokens": cb.completion_tokens,
            "Total Cost (USD)": cb.total_cost
        }
    }

    # display output
    state = log.userOutput(response['output_text'], state=state)

    # display sources
    # source_locations keeps one entry per document (duplicates included);
    # sources is de-duplicated in first-seen order so the listing is stable
    # from run to run.
    source_locations = [doc.metadata.get('source') for doc in docs]
    sources = list(dict.fromkeys(
        os.path.basename(str(location)) for location in source_locations
    ))
    state = log.userOutput("Sources:", state=state)
    state = log.userOutput('\n'.join(sources), state=state)
    state['process']['sources'] = sources
    state['process']['source_locations'] = source_locations

    # format outputs for logging
    # NOTE(review): getInputDocumentJSONs is not defined in the visible part of
    # this module.
    response['input_documents'] = getInputDocumentJSONs(response['input_documents'])
    state['output'] = response['output_text']
    state['process']['steps'].append(
        log.llmCallLog(
            llm=llm,
            prompt=str(chain),
            input=prompt,
            output=response,
            parsedOutput=state['output'],
            apiInfo=response['metadata']['call back'],
            purpose='RAG'
        )
    )
    return state
def retrieval(state):
    """
    Retrieve candidate documents from the RAG vector database.

    This is the first stage of the RAG pipeline. Depending on
    ``state['config']['RAG']`` it runs a multi-query retrieval, or a similarity
    search and/or a max marginal relevance search whose results are concatenated.

    Args:
        state (dict): A dictionary containing the LLM, user prompt, vector database,
                      and configuration settings for the RAG pipeline.

    Returns:
        tuple: The state (with a ``'rag.retrieval'`` entry appended to
               ``state['process']['steps']``) and the list of retrieved documents.
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: June 16, 2024

    # History:
    # - 2024-06-16: JP initialized the function with similarity search and multiquery
    # - 2024-06-26: MC added cut() to remove poorly chunked pieces of text
    # - 2024-06-29: MC added max_marginal_relevance_search for retrieval
    # - 2024-10-16: JP saved the doc sources and text as strings that can be sent to
    #               the GUI for display
    #
    # Issues:
    # - The MultiQueryRetriever.from_llm doesn't give control over the number of
    #   prompts or retrieved documents. Also, I don't think it generates great
    #   prompts. We could reimplement this ourselves.
    llm = state['llm']
    prompt = state['prompt']
    vectordb = state['databases']['RAG']
    memory = state['memory']  # not used below; the lookup still raises if the key is missing
    rag_config = state['config']['RAG']
    started_at = time.time()

    # Can we remove this code?
    if rag_config['cut']:
        vectordb = cut(state, vectordb)

    if rag_config['multiquery']:
        logging.basicConfig()
        logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)
        multi_retriever = MultiQueryRetriever.from_llm(
            retriever=vectordb.as_retriever(),
            llm=llm
        )
        docs = multi_retriever.get_relevant_documents(query=prompt)
    else:
        # Either search may be disabled, in which case it contributes nothing.
        similarity_hits = []
        mmr_hits = []
        if rag_config['similarity']:
            scored_hits = vectordb.similarity_search_with_relevance_scores(
                prompt, k=rag_config['num_articles_retrieved']
            )
            similarity_hits, _ = getDocumentSimilarity(scored_hits)
        if rag_config['mmr']:
            mmr_hits = vectordb.max_marginal_relevance_search(
                prompt, k=rag_config['num_articles_retrieved']
            )
        docs = similarity_hits + mmr_hits

    # Plain-string view of the hits that can be sent to the GUI.
    gui_docs = [
        {'source': hit.metadata.get('source'), 'text': hit.page_content}
        for hit in docs
    ]

    step_record = {
        'func': 'rag.retrieval',
        'multiquery': rag_config['multiquery'],
        'similarity': rag_config['similarity'],
        'mmr': rag_config['mmr'],
        'num-docs': len(docs),
        'docs': str(docs),
        'docs-to-gui': gui_docs,
        'time': time.time() - started_at
    }
    state['process']['steps'].append(step_record)
    return state, docs
def contextual_compression(docs, state):
    """
    Replace each document's text with an LLM summary focused on the user query.

    For every document the summarization template is filled with the document
    text and ``state['prompt']``, the LLM in ``state['llm']`` is invoked, and the
    stripped response becomes the document's new ``page_content``. Every LLM call
    is recorded in ``state['process']['steps']``.

    Args:
        docs (list): Documents exposing a ``page_content`` attribute. They are
            modified in place.
        state (dict): BRAD state; provides the LLM, the user prompt, the debug
            flag and the process log.

    Returns:
        list: The same ``docs`` list, with each ``page_content`` replaced by its summary.
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: July 5, 2024
    summary_template = PromptTemplate(
        input_variables=["user_query"],
        template=summarize_document_template()
    )

    for doc in docs:
        original_text = doc.page_content
        llm_input = summary_template.format(text=original_text, user_query=state['prompt'])

        # Use LLM, tracking wall-clock time and token usage
        call_started = time.time()
        with get_openai_callback() as cb:
            res = state['llm'].invoke(input=llm_input)
        res.response_metadata['time'] = time.time() - call_started
        usage = {
            "Total Tokens": cb.total_tokens,
            "Prompt Tokens": cb.prompt_tokens,
            "Completion Tokens": cb.completion_tokens,
            "Total Cost (USD)": cb.total_cost
        }
        res.response_metadata['call back'] = usage
        summary = res.content.strip()

        # Log LLM
        call_record = log.llmCallLog(
            llm=state['llm'],                  # what LLM?
            prompt=summary_template,           # what prompt template?
            input=llm_input,                   # what specific input to the llm or template?
            output=res,                        # what is the full llm output?
            parsedOutput=summary,              # what is the useful output?
            purpose='contextual compression',  # why did you use an llm
            apiInfo=usage,
        )
        state['process']['steps'].append(call_record)

        # Display debug information
        if state['config']['debug']:
            log.debugLog('============', state=state)
            log.debugLog(original_text, state=state)
            log.debugLog('Summary: ' + summary, state=state)

        doc.page_content = summary
    return docs
def documentEnrichment(docs, state):
    """
    Expand retrieved documents to every stored chunk from the same source and page.

    For each retrieved document, all chunks in the RAG vector database whose
    metadata has the same ``'source'`` and ``'page'`` are collected and wrapped
    in new ``Document`` objects. A chunk text is emitted at most once, in the
    order it is first encountered.

    Args:
        docs (list): Retrieved ``Document`` objects; their metadata is read for
                     the ``'source'`` and ``'page'`` keys.
        state (dict): The `Agent.state` containing the RAG database under
                      ``state['databases']['RAG']``.

    Returns:
        list: New ``Document`` objects, one per unique chunk text found on the
              pages of the retrieved documents. Their metadata holds only
              ``'source'`` and ``'page'``.

    Notes:
        - The vector database must provide ``get()`` returning a dictionary with
          the parallel lists ``'metadatas'`` and ``'documents'``.
    """
    # Auth: Joshua Pickard
    #       jpic@umich.edu
    # Date: October 9, 2024
    store = state['databases']['RAG'].get()
    stored_metadatas = store['metadatas']
    stored_texts = store['documents']

    enriched = []
    seen_texts = set()  # chunk texts already emitted, to avoid duplicates

    for hit in docs:
        hit_source = hit.metadata.get('source')
        hit_page = hit.metadata.get('page')

        # Walk the stored chunks in database order, keeping same-file/same-page ones
        for chunk_meta, chunk_text in zip(stored_metadatas, stored_texts):
            same_page = (
                chunk_meta.get('source') == hit_source
                and chunk_meta.get('page') == hit_page
            )
            if not same_page or chunk_text in seen_texts:
                continue
            seen_texts.add(chunk_text)
            enriched.append(
                Document(
                    page_content=chunk_text,
                    metadata={'source': hit_source, 'page': hit_page}
                )
            )
    return enriched
def getDocumentSimilarity(documents):
    """
    Splits (document, score) pairs into a list of documents and an array of scores.

    :param documents: An iterable of pairs whose first item is a document object and whose second item is its similarity score.
    :type documents: list
    :return: A tuple containing two elements:
        - A list with the first item of every pair (the documents).
        - A `numpy` array with the second item of every pair (the scores).
    :rtype: tuple
    """
    pairs = list(documents)
    docs = [pair[0] for pair in pairs]
    scores = [pair[1] for pair in pairs]
    return docs, np.array(scores)
# Define a function to get the wordnet POS tag
def get_wordnet_pos(word):
    """
    .. warning:: This function may be removed in the near future.

    Gets the WordNet part of speech (POS) tag for a given word.

    The word is tagged with ``nltk.pos_tag`` and the first letter of the
    resulting tag is mapped to a WordNet constant.

    :param word: The word for which to retrieve the POS tag.
    :type word: str
    :return: The WordNet POS tag corresponding to the given word. Defaults to noun if no specific tag is found.
    :rtype: str
    """
    # The module does not import nltk at the top level, so it is imported here;
    # without this the function failed with NameError on its first line.
    import nltk
    from nltk.corpus import wordnet

    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return tag_dict.get(tag, wordnet.NOUN)
def create_database(docsPath='papers/', dbName='database', dbPath='databases/', HuggingFaceEmbeddingsModel = 'BAAI/bge-base-en-v1.5', chunk_size=[700], chunk_overlap=[200], v=False):
    """
    .. note:: This function is not called by the chatbot. Instead, it is required that the user build the database prior to using the chat.

    Create a persistent Chroma database from PDF documents.

    All PDFs found recursively under ``docsPath`` are loaded, split once for every
    combination of ``chunk_size`` and ``chunk_overlap``, embedded, and written to
    the collection ``dbName`` stored under ``os.path.join(dbPath, dbName)``. Every
    combination is written to the same collection.

    Args:
        docsPath (str, optional): Path where the document files are located. Default is 'papers/'.
        dbName (str, optional): Name of the database directory and collection to create. Default is 'database'.
        dbPath (str, optional): Path where the database will be saved. Default is 'databases/'.
        HuggingFaceEmbeddingsModel (str, optional): Model name for HuggingFace embeddings. Default is 'BAAI/bge-base-en-v1.5'.
        chunk_size (list, optional): List of chunk sizes for splitting documents. Default is [700].
        chunk_overlap (list, optional): List of chunk overlaps for splitting documents. Default is [200].
        v (bool, optional): Verbose mode. If True, print progress messages. Default is False.

    Returns:
        The Chroma vector store returned for the last (chunk_size, chunk_overlap)
        combination, or None when either list is empty.
    """
    # The list defaults are only read, never mutated, so sharing them across
    # calls is harmless.
    if v:
        print('\nWork Directory: {}'.format(os.getcwd()))

    # Phase 1 - Load the embedding model and the documents
    embeddings_model = HuggingFaceEmbeddings(model_name=HuggingFaceEmbeddingsModel)
    if v:
        print(f"\nDocuments loading from: '{docsPath}'")
    loader = DirectoryLoader(docsPath,
                             glob="**/*.pdf",
                             loader_cls=PyPDFLoader,
                             show_progress=True,
                             use_multithreading=True)
    docs_data = loader.load()
    if v:
        print('\nDocuments loaded...')

    # Phase 2 - Prepare the on-disk location; it does not depend on the chunking
    db_dir = os.path.join(dbPath, dbName)
    os.makedirs(db_dir, exist_ok=True)
    _client_settings = chromadb.PersistentClient(path=db_dir)

    # Phase 3 - Split and store the documents for every chunking configuration
    vectordb = None
    for size in chunk_size:
        for overlap in chunk_overlap:
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=size,
                                                           chunk_overlap=overlap,
                                                           separators=[" ", ",", "\n", ". "])
            data_splits = text_splitter.split_documents(docs_data)
            if v:
                print("Documents split into chunks...")
                print("Initializing Chroma Database...")
            vectordb = Chroma.from_documents(documents=data_splits,
                                             embedding=embeddings_model,
                                             client=_client_settings,
                                             collection_name=dbName,
                                             collection_metadata={"hnsw:space": "cosine"})
            # Release the chunk list before the next configuration is processed
            del text_splitter, data_splits
    return vectordb
def best_match(prompt, title_list, state=None):
    """
    Find the best matching title from the list based on cosine similarity with a given prompt.

    Only titles whose similarity with the prompt exceeds 0.50 are considered a match.

    Parameters:
    - prompt (str): The prompt or query to find the best match for.
    - title_list (list): A list of titles (strings) to compare against the prompt.
    - state (dict, optional): BRAD state forwarded to ``log.debugLog``. When omitted,
      nothing is logged.

    Returns:
    - best_title (str): The title from title_list that best matches the prompt, or ""
      when the list is empty or no title exceeds the threshold.
    - best_score (float): The cosine similarity score of the best matching title with
      the prompt, or 0.0 when there is no match.
    """
    best_title = ""
    best_score = 0.0

    # Remove duplicate titles while keeping their first-seen order
    unique_title_list = list(dict.fromkeys(title_list or []))
    if not unique_title_list:
        return best_title, best_score

    # Initialize a sentence transformer model
    sentence_model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

    # Encode the prompt and titles into embeddings
    query_embedding = sentence_model.encode(prompt)
    passage_embedding = sentence_model.encode(unique_title_list)

    # A title must beat this score to be accepted; the bar rises with each better match
    score_to_beat = 0.50  # Adjust as needed

    # Compare cosine similarity between query embedding and each title embedding
    similarities = util.cos_sim(query_embedding, passage_embedding)[0]
    for score, title in zip(similarities, unique_title_list):
        score = float(score)
        if score > score_to_beat:
            score_to_beat = score
            best_score = score
            best_title = title

    if state is not None:
        log.debugLog(f"The best match is {best_title} with a score of {best_score}", state=state)
    return best_title, best_score
#Split into two methods?
def get_all_sources(vectordb, prompt, path):
    """
    List the cleaned source names stored in the vector database and the IDs of the entries whose name appears in the prompt.

    Each stored ``'source'`` path is cleaned by removing every occurrence of ``path``,
    dropping a trailing ``.pdf`` and lower-casing the result. An entry is selected when
    its cleaned name is a substring of the lower-cased prompt.

    :param vectordb: The vector database object; its ``get()`` must return a dictionary with the parallel lists ``'ids'`` and ``'metadatas'``.
    :type vectordb: object
    :param prompt: The prompt or query used to filter the sources retrieved from the vector database.
    :type prompt: str
    :param path: The directory prefix removed from every stored source path.
    :type path: str
    :raises KeyError: If a metadata entry has no ``'source'`` key.
    :return: A tuple containing:
        - real_source_list: The cleaned source name of every entry in the database (not filtered).
        - filtered_ids: The IDs of the entries whose cleaned source name occurs in the prompt.
    :rtype: tuple
    """
    prompt = prompt.lower()

    # Fetch the collection once; ids and metadatas must come from the same snapshot
    records = vectordb.get()

    # Clean every stored source path: strip the directory prefix and extension
    real_source_list = [
        item['source'].replace(path, '').removesuffix('.pdf').lower()
        for item in records['metadatas']
    ]

    # Keep the ids whose cleaned source name is mentioned in the prompt
    filtered_ids = [
        entry_id
        for entry_id, source_name in zip(records['ids'], real_source_list)
        if source_name in prompt
    ]
    return real_source_list, filtered_ids
#Given the prompt, find the title and corresponding score that is the best match
def adj_matrix_builder(docs, state):
    """
    Build a prompt-aware similarity (adjacency) matrix for a list of documents.

    The prompt and the document texts are embedded with a sentence transformer.
    The entry for documents i and j blends their mutual cosine similarity with the
    average of their similarities to the prompt:

        prompt_scale * sim(doc_i, doc_j)
        + (1 - prompt_scale) * 0.5 * (sim(prompt, doc_i) + sim(prompt, doc_j))

    Only the first ``state['config']['RAG']['num_articles_retrieved']`` documents are
    used, so the matrix can be smaller than ``len(docs)``.

    :param docs: A list of documents exposing a ``page_content`` attribute.
    :type docs: list
    :param state: A dictionary containing the chat status and configuration; ``'prompt'``
        and ``state['config']['RAG']['num_articles_retrieved']`` are read.
    :type state: dict
    :return: A 2D numpy array where the element at position (i, j) is the blended
        similarity score between documents i and j.
    :rtype: np.ndarray
    """
    prompt_scale = 0.5  # Weight of the document-document term versus the prompt term

    # Initialize a sentence transformer model
    sentence_model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

    # Row 0 is the prompt; rows 1.. are the documents
    doc_list = [state['prompt']] + [doc.page_content for doc in docs]
    passage_embedding = sentence_model.encode(doc_list)

    # Pairwise cosine similarities between the prompt and the retained documents
    num_retained = state['config']['RAG']['num_articles_retrieved']
    cosine_similarities = np.asarray(
        cosine_similarity(passage_embedding[:num_retained + 1]), dtype=float
    )

    # Similarity of each document to the prompt
    prompt_sim = cosine_similarities[0, 1:]
    # Document-document block. Row/column 0 belongs to the prompt, so it must be
    # skipped; indexing the full matrix with document indices pairs the wrong texts.
    doc_sim = cosine_similarities[1:, 1:]

    prompt_term = prompt_sim[:, np.newaxis] + prompt_sim[np.newaxis, :]
    return prompt_scale * doc_sim + 0.5 * (1 - prompt_scale) * prompt_term
def normalize_adjacency_matrix(A):
    """
    Normalize an adjacency matrix by dividing each element by the sum of its column.

    :param A: Input adjacency matrix to be normalized, where each element (i, j) represents the weight of
        the edge from node i to node j.
    :type A: np.ndarray
    :return: Normalized adjacency matrix where each element at position (i, j) is divided by the sum of
        the j-th column of the original matrix A, so that columns sum to 1. Columns whose sum is zero
        are returned as all zeros instead of NaN/inf.
    :rtype: np.ndarray
    """
    A = np.asarray(A, dtype=float)
    col_sums = A.sum(axis=0)[np.newaxis, :]  # Sum of each column, shaped to broadcast over rows
    normalized_A = np.zeros_like(A)
    # Divide only where the column sum is non-zero; other entries keep their 0.0
    np.divide(A, col_sums, out=normalized_A, where=col_sums != 0)
    return normalized_A
#weighted pagerank algorithm
#reranker
#removes repeat chunks in vectordb
def remove_repeats(vectordb):
    """
    Removes repeated chunks in the provided vector database.

    This function identifies entries with identical document text and deletes
    the repeated entries, keeping only the last occurrence of each duplicated document.

    :param vectordb: The vector database from which repeated documents should be removed.
    :type vectordb: An instance of a vector database class with 'get' and 'delete' methods.
    :raises KeyError: If the vector database does not contain 'ids' or 'documents' keys.
    :return: The same vector database object, after duplicate documents have been deleted.
    :rtype: An instance of the vector database class.
    """
    # Auth: Marc Choi
    #       machoi@umich.edu
    # Date: June 18, 2024

    # Fetch ids and contents in a single call so both lists describe the same snapshot
    records = vectordb.get()
    df = pd.DataFrame({'id': records['ids'], 'documents': records['documents']})

    # Every occurrence except the last of a duplicated text is marked for deletion
    is_repeat = df.duplicated(subset='documents', keep='last')
    repeated_ids = df.loc[is_repeat, 'id'].tolist()

    # Delete duplicate documents if any found
    if repeated_ids:
        vectordb.delete(repeated_ids)
    return vectordb
#experimental - see the relative frequency of newline characters showing up in a given doc
def relative_frequency_of_char(input_string, char='\n'):
    """
    Calculate the relative frequency of a character in a given input string.

    By default the newline character is counted.

    :param input_string: The input string in which to calculate the relative frequency.
    :type input_string: str
    :param char: The character (or substring) to count; occurrences are counted without overlap.
        Defaults to the newline character.
    :type char: str
    :raises ValueError: If ``char`` is empty.
    :return: The number of occurrences of ``char`` divided by the total number of characters in the
        input string. If the input string is empty, it returns 0.0.
    :rtype: float
    """
    if not char:
        raise ValueError("char must be a non-empty string")
    if not input_string:
        return 0.0  # Avoid dividing by zero for empty input
    return input_string.count(char) / len(input_string)
def cut(state, vectordb):
    """
    Remove the documents with the highest relative newline frequency from a vector database.

    The relative frequency computed by ``relative_frequency_of_char`` is evaluated for
    every stored document, and the documents strictly above the 80th percentile of
    those values are deleted from the database.

    :param state: A dictionary containing chat status information. It is not read by this function.
    :type state: dict
    :param vectordb: An object representing the vector database; it must provide ``get()`` returning
                     ``'ids'`` and ``'documents'``, and ``delete(ids)``.
    :return: The same vector database object after removing the documents above the cutoff.
    """
    # NOTE(review): the deletion modifies the database object itself, so calling
    # this repeatedly keeps removing the current top of the distribution.

    # Fetch document IDs and contents in a single call
    records = vectordb.get()
    ids = records['ids']
    documents = records['documents']

    # Nothing stored: a percentile of an empty list is undefined
    if not ids:
        return vectordb

    # Relative frequency of the tracked character for each document
    relfreq = np.array([relative_frequency_of_char(doc) for doc in documents])

    # Cutoff value for relative frequency (80th percentile)
    cutoff = np.percentile(relfreq, 80)
    print(f"Cutoff relative frequency: {cutoff}")

    # IDs of the documents strictly above the cutoff
    filtered_ids = [doc_id for doc_id, freq in zip(ids, relfreq) if freq > cutoff]

    # Delete filtered documents from the vector database if any are found
    if filtered_ids:
        vectordb.delete(filtered_ids)
    return vectordb