Upload 15 files

- Dockerfile +11 -0
- app.py +247 -0
- chainlit.md +14 -0
- myutils/__pycache__/finetuning.cpython-311.pyc +0 -0
- myutils/__pycache__/finetuning.cpython-312.pyc +0 -0
- myutils/__pycache__/prepare_data_for_finetuning.cpython-311.pyc +0 -0
- myutils/__pycache__/rag_pipeline_utils.cpython-311.pyc +0 -0
- myutils/__pycache__/rag_pipeline_utils.cpython-312.pyc +0 -0
- myutils/__pycache__/ragas_pipeline.cpython-311.pyc +0 -0
- myutils/__pycache__/ragas_pipeline.cpython-312.pyc +0 -0
- myutils/finetuning.py +410 -0
- myutils/pdfloader.py +87 -0
- myutils/rag_pipeline_utils.py +289 -0
- myutils/ragas_pipeline.py +86 -0
- requirements.txt +17 -0
Dockerfile
ADDED
FROM python:3.11
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
WORKDIR $HOME/app
COPY --chown=user . $HOME/app
# COPY does not expand "~", so the destination must spell out $HOME
COPY ./requirements.txt $HOME/app/requirements.txt
RUN pip install -r requirements.txt
COPY . .
CMD ["chainlit", "run", "app.py", "--port", "7860"]
app.py
ADDED
"""
app_end_to_end_prototype.py

1. This app loads two pdf documents and allows the user to ask questions about these documents.
   The documents used are:

   https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf
   AND
   https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf

2. The two documents are pre-processed on start. Brief details on the pre-processing:
   a. Text is split into chunks using the langchain RecursiveCharacterTextSplitter method.
   b. The text in each chunk is converted to an embedding using OpenAI text-embedding-3-small embeddings.
      Each embedding produced by this model has dimension 1536,
      so each chunk is represented by an embedding of dimension 1536.
   c. The collection of embeddings for all chunks, along with metadata, is saved/indexed in a vector database.
   d. For this exercise, I use an in-memory instance of the Qdrant vector db.

3. The next step is to build a RAG pipeline to answer questions. This is implemented as follows:
   a. I use a simple prompt that instructs the LLM to answer based only on retrieved contexts.
   b. First, the user query is encoded using the same embedding model as the documents.
   c. Second, a set of relevant documents is returned by the retriever,
      which efficiently searches the vector db and returns the most relevant chunks.
   d. Third, the user query and retrieved contexts are then passed to a chat-enabled LLM.
      I use OpenAI's gpt-4o-mini throughout this exercise.
   e. Fourth, the chat model processes the user query and context along with the prompt and
      generates a response that is then passed to the user.

4. The cl.on_chat_start decorator initiates the conversation with the user.

5. The cl.on_message decorator wraps the main function, which does the following:
   a. receives the query that the user types in
   b. runs the RAG pipeline
   c. sends results back to the UI for display

Additional Notes:
   a. note the use of async functions and async/await syntax throughout the module
   b. note the use of streaming when sending the response back to the UI
"""

import os
from typing import List
from dotenv import load_dotenv

# chainlit imports
import chainlit as cl

# langchain imports
# document loader
from langchain_community.document_loaders import PyMuPDFLoader
# text splitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
# embeddings model to embed each chunk of text in doc
from langchain_openai import OpenAIEmbeddings
# llm for text generation using prompt plus retrieved context plus query
from langchain_openai import ChatOpenAI
# templates to create custom prompts
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
# LCEL Runnable Passthrough
from langchain_core.runnables import RunnablePassthrough
# to parse output from llm
from langchain_core.output_parsers import StrOutputParser
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings

from myutils.rag_pipeline_utils import SimpleTextSplitter, SemanticTextSplitter, VectorStore, AdvancedRetriever
from myutils.ragas_pipeline import RagasPipeline
from myutils.rag_pipeline_utils import load_all_pdfs, set_up_rag_pipeline


load_dotenv()

# Flag to indicate if pdfs should be loaded directly from URLs.
# If True, get pdfs from urls; if False, get them from a local copy.
LOAD_PDF_DIRECTLY_FROM_URL = True

# set the APP_MODE - one of two choices:
# early_prototype means use OpenAI embeddings
# advanced_prototype means use finetuned model embeddings
APP_MODE = "early_prototype"

if APP_MODE == "early_prototype":
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small')
    embed_dim = 1536
    appendix_to_user_message = "This chatbot is built using OpenAI Embeddings as a fast prototype."
else:
    finetuned_model_id = "Vira21/finetuned_arctic"
    embeddings = HuggingFaceEmbeddings(model_name=finetuned_model_id)
    appendix_to_user_message = "Our Tech team finetuned snowflake-arctic-embed-m to bring you this chatbot!!"
    embed_dim = 768

rag_template = """
You are an assistant for question-answering tasks.
You will be given documents on the risks of AI, and on frameworks and
policies formulated by various governmental agencies to articulate
these risks and to safeguard against them.

Use the following pieces of retrieved context to answer
the question.

You must answer the question based only on the context provided.

If you don't know the answer, or if the context does not provide sufficient information,
then say that you don't know.

If the user expresses gratitude or types a greeting, respond respectfully instead of saying
"I don't know." Acknowledge their message and kindly ask if they have any questions related to AI risks.

Think through your answer step-by-step.

Context:
{context}

Question:
{question}
"""

rag_prompt = ChatPromptTemplate.from_template(template=rag_template)

# parameters to manage text splitting/chunking
chunk_kwargs = {
    'chunk_size': 1000,
    'chunk_overlap': 300
}

retrieval_chain_kwargs = {
    'location': ":memory:",
    'collection_name': 'End_to_End_Prototype',
    'embeddings': embeddings,
    'embed_dim': embed_dim,
    'prompt': rag_prompt,
    'qa_llm': ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
}

urls_for_pdfs = [
    "https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf",
    "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf"
]

pdf_file_paths = [
    './data/docs_for_rag/Blueprint-for-an-AI-Bill-of-Rights.pdf',
    './data/docs_for_rag/NIST.AI.600-1.pdf'
]

# if flag is True, then pass in pointers to URLs
# if flag is False, then pass in file pointers
if LOAD_PDF_DIRECTLY_FROM_URL:
    docpathlist = urls_for_pdfs
else:
    docpathlist = pdf_file_paths


class RetrievalAugmentedQAPipelineWithLangchain:
    def __init__(self,
                 list_of_documents,
                 chunk_kwargs,
                 retrieval_chain_kwargs):
        self.list_of_documents = list_of_documents
        self.chunk_kwargs = chunk_kwargs
        self.retrieval_chain_kwargs = retrieval_chain_kwargs

        self.load_documents()
        self.split_text()
        self.set_up_rag_pipeline()
        return

    def load_documents(self):
        self.documents = load_all_pdfs(self.list_of_documents)
        return self

    def split_text(self):
        baseline_text_splitter = \
            SimpleTextSplitter(**self.chunk_kwargs, documents=self.documents)
        # split text for baseline case
        self.baseline_text_splits = baseline_text_splitter.split_text()
        return self

    def set_up_rag_pipeline(self):
        self.retrieval_chain = set_up_rag_pipeline(
            **self.retrieval_chain_kwargs,
            text_splits=self.baseline_text_splits
        )
        return self


RETRIEVAL_CHAIN = \
    RetrievalAugmentedQAPipelineWithLangchain(
        list_of_documents=docpathlist,
        chunk_kwargs=chunk_kwargs,
        retrieval_chain_kwargs=retrieval_chain_kwargs
    ).retrieval_chain


@cl.set_starters
async def set_starters():
    return [
        cl.Starter(
            label="AI Bill of Rights",
            message="What are the key principles outlined in the Blueprint for an AI Bill of Rights?",
            description="Learn about the fundamental rights and protections proposed in the AI Bill of Rights",
        ),
        cl.Starter(
            label="AI Risk Assessment",
            message="What are the main risks and challenges identified in the NIST AI Risk Management Framework?",
            description="Understand key AI risks and mitigation strategies",
        ),
        cl.Starter(
            label="Data Privacy Protection",
            message="How do these documents address data privacy and protection in AI systems?",
            description="Explore guidelines for protecting personal data in AI applications",
        ),
        cl.Starter(
            label="AI System Testing",
            message="What are the recommended approaches for testing and validating AI systems for safety and reliability?",
            description="Learn about AI system validation and testing requirements",
        ),
    ]


@cl.on_chat_start
async def on_chat_start():
    # initialize the retrieval chain without sending a welcome message
    cl.user_session.set("retrieval_chain", RETRIEVAL_CHAIN)


@cl.on_message
async def main(message):
    retrieval_chain = cl.user_session.get("retrieval_chain")

    msg = cl.Message(content="")

    # run the (synchronous) chain in a worker thread so the event loop is not blocked
    result = await cl.make_async(retrieval_chain.invoke)({"question": message.content})

    # the chain returns a complete response; stream it to the UI
    # character-by-character to simulate token streaming
    for stream_resp in result["response"].content:
        await msg.stream_token(stream_resp)

    await msg.send()
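Since `RETRIEVAL_CHAIN` is built at module import time, the pipeline can also be smoke-tested outside the Chainlit UI. A minimal sketch, assuming `OPENAI_API_KEY` is set in the environment and the two PDF URLs are reachable; the module name `app` and the question are illustrative:

# hypothetical smoke test for the RAG pipeline, run outside Chainlit
from app import RETRIEVAL_CHAIN

result = RETRIEVAL_CHAIN.invoke({"question": "What principles does the AI Bill of Rights outline?"})
print(result["response"].content)   # generated answer
print(len(result["context"]))       # retrieved chunks (k=5 in the simple retriever)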
chainlit.md
ADDED
# Welcome to Chainlit! 🚀🤖

Hi there, Developer! 👋 We're excited to have you on board. Chainlit is a powerful tool designed to help you prototype, debug and share applications built on top of LLMs.

## Useful Links 🔗

- **Documentation:** Get started with our comprehensive [Chainlit Documentation](https://docs.chainlit.io) 📚
- **Discord Community:** Join our friendly [Chainlit Discord](https://discord.gg/k73SQ3FyUh) to ask questions, share your projects, and connect with other developers! 💬

We can't wait to see what you create with Chainlit! Happy coding! 💻😊

## Welcome screen

To modify the welcome screen, edit the `chainlit.md` file at the root of your project. If you do not want a welcome screen, just leave this file empty.
myutils/__pycache__/finetuning.cpython-311.pyc
ADDED
Binary file (21 kB)

myutils/__pycache__/finetuning.cpython-312.pyc
ADDED
Binary file (19.2 kB)

myutils/__pycache__/prepare_data_for_finetuning.cpython-311.pyc
ADDED
Binary file (14.7 kB)

myutils/__pycache__/rag_pipeline_utils.cpython-311.pyc
ADDED
Binary file (12.6 kB)

myutils/__pycache__/rag_pipeline_utils.cpython-312.pyc
ADDED
Binary file (11.1 kB)

myutils/__pycache__/ragas_pipeline.cpython-311.pyc
ADDED
Binary file (3.91 kB)

myutils/__pycache__/ragas_pipeline.cpython-312.pyc
ADDED
Binary file (3.35 kB)
myutils/finetuning.py
ADDED
"""
finetuning_pipeline.py

Collects a number of methods in classes to streamline the finetuning of model embeddings.

#### Fine-tuning Steps

1. Prepare Train, Val and Test Data
   - if needed, chunk data to get a list of LC Documents
   - split the list into train, val and test sub-groups
   - for each sub-group, use an LLM to generate a list of POSITIVE question, context pairs.
     This is done by passing the context to the LLM along with a prompt to generate
     `n_questions` number of questions; the questions are extracted from the LLM output
     and paired with the underlying context. Note that each context will have more than
     one question paired with it.
   - write out the list of question, context pairs for train, val and test sub-groups
     into a jsonl file for future reference
   - the train sub-group is loaded into a HF Dataset object for use in training
2. Data Loader
   - set up the data loader
   - this includes the training data along with batch size information
3. Load model to be finetuned
   - use the HF model name to load the model
4. Set up the loss function
   - concept of inner loss: MultipleNegativesRankingLoss
   - wrap the inner loss in an overall loss: MatryoshkaLoss
5. Set up the finetuning pipeline
   - this includes data, model, loss and hyperparameters
   - hyperparameters include number of epochs, warmup, etc.
6. Run the finetuning pipeline and get modified model embeddings
   - save these embeddings
   - see if these can be loaded onto HF
   - see if these can be downloaded from HF
7. Validation Loss
   - run assessment on the val sub-group
"""

# imports
import uuid
import random
import tqdm
import re
import json
from typing import List

import pandas as pd

from torch.utils.data import DataLoader

from sentence_transformers import SentenceTransformer
from sentence_transformers import InputExample
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss
from sentence_transformers.evaluation import InformationRetrievalEvaluator

from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI


class GenerateQuestionsForContexts:
    def __init__(self,
                 qa_chat_model_name="gpt-4o-mini",
                 n_questions=3):
        self.qa_chat_model_name = qa_chat_model_name
        # regex pattern used to extract questions from the LLM response:
        # first group is the question number - an integer - followed by a period;
        # second group is everything that follows
        self.regex_pattern = r'^(\d+)\.(.+)'
        self.n_questions = n_questions

        self.set_up_chat_model()
        self.set_up_question_generation_chain()
        return

    def get_unique_id(self, id_set):
        """
        Generate a unique id not present in the input set of ids.
        Input
            a set of unique identifiers
        Returns
            a new unique id not in the input set
            the updated input set of ids, including the newly generated id
        """
        id = str(uuid.uuid4())
        while id in id_set:
            id = str(uuid.uuid4())
        id_set.add(id)
        return id, id_set

    def set_up_chat_model(self):
        self.qa_chat_model = ChatOpenAI(
            model=self.qa_chat_model_name,
            temperature=0
        )
        return self

    def set_up_question_generation_chain(self):
        qa_prompt = """\
Given the following context, you must generate questions based on only the provided context.

You are to generate {n_questions} questions which should be provided in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""
        qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)
        self.question_generation_chain = qa_prompt_template | self.qa_chat_model
        return self

    def create_questions(self, documents, n_questions):
        questions = {}
        relevant_docs = {}

        q_id_set = set()
        for document in tqdm.tqdm(documents):  # note tqdm.tqdm (NOT just tqdm as in the original notebook)
            this_question_set = \
                self.question_generation_chain.invoke(
                    {
                        'context': document.page_content,
                        'n_questions': n_questions
                    }
                )
            for question in this_question_set.content.split("\n"):
                if len(question) > 0:
                    try:
                        q_id, q_id_set = self.get_unique_id(q_id_set)
                        matched_pattern = re.search(self.regex_pattern, question)  # regex search for "n. <question>"
                        if len(matched_pattern.group(2)) > 0:
                            questions[q_id] = matched_pattern.group(2).strip()  # extract the question string
                            relevant_docs[q_id] = [document.metadata["id"]]
                    except Exception:
                        continue
        return questions, relevant_docs


class PrepareDataForFinetuning(GenerateQuestionsForContexts):
    def __init__(self,
                 chunk_size=None, chunk_overlap=None, len_function=None,
                 lcdocuments=None, run_optional_text_splitter=False,
                 all_splits=None, train_val_test_size=[10, 5, 5],
                 train_val_test_split_type='random',
                 random_seed=69, qa_chat_model_name="gpt-4o-mini",
                 n_questions=2, batch_size=5):

        super().__init__(qa_chat_model_name=qa_chat_model_name,
                         n_questions=n_questions)

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.len_function = len_function

        self.lcdocuments = lcdocuments
        self.run_optional_text_splitter = run_optional_text_splitter

        self.all_doc_splits = all_splits

        self.train_val_test_size = train_val_test_size
        self.n_train = self.train_val_test_size[0]
        self.n_val = self.train_val_test_size[1]
        self.n_test = self.train_val_test_size[2]
        self.train_val_test_split_type = train_val_test_split_type

        self.random_seed = random_seed
        self.batch_size = batch_size
        return

    def optional_text_splitter(self):
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=self.len_function
        )
        self.all_doc_splits = text_splitter.split_documents(self.lcdocuments.load())
        return self

    def attach_unique_ids_to_docs(self):
        id_set = set()
        for docsplit in self.all_doc_splits:
            id, id_set = self.get_unique_id(id_set)
            docsplit.metadata["id"] = id
        return self

    def simple_train_val_test_splits(self):
        self.training_splits = self.all_doc_splits[:self.n_train]
        self.val_splits = self.all_doc_splits[self.n_train:self.n_train+self.n_val]
        self.test_splits = self.all_doc_splits[self.n_train+self.n_val:]
        return self

    def randomized_train_val_test_splits(self):
        # set the same seed to be able to replicate the result of
        # the random shuffle below
        random.seed(self.random_seed)

        # randomly order the elements of the list of documents
        randomly_ordered_documents = self.all_doc_splits.copy()
        random.shuffle(randomly_ordered_documents)

        # assign slices to training, val and test
        self.training_splits = randomly_ordered_documents[:self.n_train]
        self.val_splits = randomly_ordered_documents[self.n_train: self.n_train+self.n_val]
        self.test_splits = randomly_ordered_documents[self.n_train+self.n_val:]
        return self

    def get_all_questions(self):
        self.training_questions, self.training_relevant_contexts = \
            self.create_questions(documents=self.training_splits, n_questions=self.n_questions)
        self.val_questions, self.val_relevant_contexts = \
            self.create_questions(documents=self.val_splits, n_questions=self.n_questions)
        self.test_questions, self.test_relevant_contexts = \
            self.create_questions(documents=self.test_splits, n_questions=self.n_questions)
        return self

    def save_dataset_to_jsonl(self, splits, questions, relevant_contexts, jsonl_filename):
        """
        NOTE: Each `jsonl` file has a single line! This is a nested JSON structure.
        Primary keys for each file are `questions`, `relevant_contexts` and `corpus`.
        1. Each `questions` element is a json object whose key is a question id and
           whose value is the string corresponding to the question.
        2. Each `relevant_contexts` element is a json object whose key is a question id
           and whose value is the unique id of the corresponding context.
        3. Each `corpus` element is a json object whose key is a unique context id
           and whose value is the context string.
        """
        corpus = {item.metadata["id"]: item.page_content for item in splits}
        dataset_dict = {
            "questions": questions,
            "relevant_contexts": relevant_contexts,
            "corpus": corpus
        }
        with open(jsonl_filename, "w") as f:
            json.dump(dataset_dict, f)
        return dataset_dict

    def save_train_val_test_dataset_to_jsonl(self):
        self.train_dataset = \
            self.save_dataset_to_jsonl(self.training_splits,
                                       self.training_questions,
                                       self.training_relevant_contexts,
                                       jsonl_filename='./data/finetuning_data/training_dataset.jsonl')

        self.val_dataset = \
            self.save_dataset_to_jsonl(self.val_splits,
                                       self.val_questions,
                                       self.val_relevant_contexts,
                                       jsonl_filename='./data/finetuning_data/val_dataset.jsonl')

        self.test_dataset = \
            self.save_dataset_to_jsonl(self.test_splits,
                                       self.test_questions,
                                       self.test_relevant_contexts,
                                       jsonl_filename='./data/finetuning_data/test_dataset.jsonl')
        return self

    def run_all_prep_data(self):
        # if docs are passed in pre-chunking, then split docs
        if self.run_optional_text_splitter is True:
            self.optional_text_splitter()

        # each chunk i.e., context gets a unique id
        self.attach_unique_ids_to_docs()

        # split into train, val and test - either random or simple slicing
        if self.train_val_test_split_type.upper() == 'RANDOM':
            self.randomized_train_val_test_splits()
        else:
            self.simple_train_val_test_splits()

        # generate questions for each context
        # this step involves a large number of LLM calls
        self.get_all_questions()

        # save train, val and test datasets in jsonl format
        self.save_train_val_test_dataset_to_jsonl()
        return self


class FineTuneModel:
    def __init__(self,
                 train_data,
                 val_data,
                 batch_size,
                 base_model_id='Snowflake/snowflake-arctic-embed-m',
                 matryoshka_dimensions=[768, 512, 256, 128, 64],
                 number_of_training_epochs=5,
                 finetuned_model_output_path='finetuned_arctic',
                 evaluation_steps=50):
        self.train_data = train_data
        self.val_data = val_data
        self.batch_size = batch_size

        self.base_model_id = base_model_id
        self.matryoshka_dimensions = matryoshka_dimensions
        self.number_of_training_epochs = number_of_training_epochs
        self.finetuned_model_output_path = finetuned_model_output_path
        self.evaluation_steps = evaluation_steps

        self.model = SentenceTransformer(self.base_model_id)
        return

    def prepare_data_for_finetuning(self, data):
        corpus = data['corpus']
        queries = data['questions']
        relevant_docs = data['relevant_contexts']
        return corpus, queries, relevant_docs

    def get_data_loader(self):
        corpus, queries, relevant_docs = self.prepare_data_for_finetuning(self.train_data)

        examples = []
        for query_id, query in queries.items():
            doc_id = relevant_docs[query_id][0]
            text = corpus[doc_id]
            example = InputExample(texts=[query, text])
            examples.append(example)
        self.loader = DataLoader(examples, batch_size=self.batch_size)
        return self

    def loss_function(self):
        inner_training_loss = MultipleNegativesRankingLoss(self.model)
        self.train_loss = MatryoshkaLoss(
            self.model,
            inner_training_loss,
            matryoshka_dims=self.matryoshka_dimensions
        )
        return self

    def get_evaluator_for_val(self):
        corpus, queries, relevant_docs = self.prepare_data_for_finetuning(self.val_data)
        self.evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)
        return self

    def fit_model(self):
        # warm up over the first 10% of training steps
        warmup_steps = int(len(self.loader) * self.number_of_training_epochs * 0.1)
        self.model.fit(
            train_objectives=[(self.loader, self.train_loss)],
            epochs=self.number_of_training_epochs,
            warmup_steps=warmup_steps,
            output_path=self.finetuned_model_output_path,
            show_progress_bar=True,
            evaluator=self.evaluator,
            evaluation_steps=self.evaluation_steps,
        )
        return self

    def run_steps_to_finetune_model(self):
        # load train data into the DataLoader
        self.get_data_loader()

        # set up the loss function
        self.loss_function()

        # set up the evaluator with val data
        self.get_evaluator_for_val()

        # finetune the model
        self.fit_model()
        return self


class FineTuneModelAndEvaluateRetriever(FineTuneModel):
    def __init__(self,
                 train_data,
                 val_data,
                 test_data,
                 batch_size,
                 base_model_id='Snowflake/snowflake-arctic-embed-m',
                 matryoshka_dimensions=[768, 512, 256, 128, 64],
                 number_of_training_epochs=5,
                 finetuned_model_output_path='finetuned_arctic',
                 evaluation_steps=50):
        super().__init__(train_data=train_data,
                         val_data=val_data,
                         batch_size=batch_size,
                         base_model_id=base_model_id,
                         matryoshka_dimensions=matryoshka_dimensions,
                         number_of_training_epochs=number_of_training_epochs,
                         finetuned_model_output_path=finetuned_model_output_path,
                         evaluation_steps=evaluation_steps)
        self.test_data = test_data
        return

    def set_up_test_data_for_retrieval(self, embedding_model_for_retrieval, top_k_for_retrieval):
        corpus, questions, relevant_docs = self.prepare_data_for_finetuning(self.test_data)

        documents = [Document(page_content=content, metadata={"id": doc_id})
                     for doc_id, content in corpus.items()]

        vectorstore = FAISS.from_documents(documents, embedding_model_for_retrieval)
        retriever = vectorstore.as_retriever(search_kwargs={"k": top_k_for_retrieval})
        return corpus, questions, relevant_docs, retriever

    def evaluate_embeddings_model(self, embedding_model_for_retrieval, top_k_for_retrieval, verbose=False):
        corpus, questions, relevant_docs, retriever = \
            self.set_up_test_data_for_retrieval(embedding_model_for_retrieval, top_k_for_retrieval)
        eval_results = []
        for id, question in tqdm.tqdm(questions.items()):
            retrieved_nodes = retriever.invoke(question)
            retrieved_ids = [node.metadata["id"] for node in retrieved_nodes]
            expected_id = relevant_docs[id][0]
            is_hit = expected_id in retrieved_ids
            eval_results.append({"id": id, "question": question, "expected_id": expected_id, "is_hit": is_hit})
        return eval_results
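Taken together, these classes form a small end-to-end workflow: prepare question/context pairs, finetune the embedding model, then check retrieval hit rate on the held-out test split. A minimal sketch of the wiring, assuming `all_splits` already holds chunked LangChain Documents, `OPENAI_API_KEY` is set, and `./data/finetuning_data/` exists:

# hypothetical driver for the finetuning workflow sketched above
from langchain_huggingface import HuggingFaceEmbeddings
from myutils.finetuning import PrepareDataForFinetuning, FineTuneModelAndEvaluateRetriever

prep = PrepareDataForFinetuning(all_splits=all_splits,   # assumed: pre-chunked LC Documents
                                train_val_test_size=[10, 5, 5],
                                n_questions=2, batch_size=5)
prep.run_all_prep_data()      # many LLM calls; writes train/val/test jsonl files

tuner = FineTuneModelAndEvaluateRetriever(train_data=prep.train_dataset,
                                          val_data=prep.val_dataset,
                                          test_data=prep.test_dataset,
                                          batch_size=5)
tuner.run_steps_to_finetune_model()   # saves the tuned model to 'finetuned_arctic'

# hit-rate check of the tuned embeddings on the test split
tuned = HuggingFaceEmbeddings(model_name='finetuned_arctic')
hits = tuner.evaluate_embeddings_model(tuned, top_k_for_retrieval=5)
print(sum(r["is_hit"] for r in hits) / len(hits))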
myutils/pdfloader.py
ADDED
"""
pdfloader.py

This class loads a list of pdf documents passed in
and returns a list of parsed text for these docs.

The user can provide one of a few options to load pdfs:
pypdf or pymupdf
"""

# importing required classes
import os
from typing import List

from pypdf import PdfReader
import pymupdf


VALID_PDF_MODULES = ['pypdf', 'pymupdf']


class TextFromPdf:
    '''
    This class converts a list of pdf documents into a list of text documents.
    '''
    def __init__(self,
                 pdfmodule: str,
                 list_of_pdf_docs: List[str]):

        # validate pdfmodule
        if pdfmodule in VALID_PDF_MODULES:
            self.pdfmodule = pdfmodule
        else:
            print(f'ERROR: pdfmodule must be one of {VALID_PDF_MODULES}')
            raise Exception

        # validate input list
        if isinstance(list_of_pdf_docs, list) and len(list_of_pdf_docs) > 0:
            self.list_of_pdf_docs = list_of_pdf_docs
        else:
            print('ERROR: expecting a non-empty list of pdf names to be passed in')
            raise Exception
        return

    def process_single_pdf_with_pypdf(self, pdfdoc):
        # check if the file exists; if not, return None
        if not os.path.isfile(pdfdoc):
            print(f'Warning: pdf file {pdfdoc} does not exist...skipping to next pdf file')
            return None
        reader = PdfReader(pdfdoc)
        numpages = len(reader.pages)
        thistext = ''
        for pagecount in range(0, numpages):
            page = reader.pages[pagecount]
            pagetext = page.extract_text()
            thistext = thistext + '\n ' + pagetext  # adding a line break
        return thistext

    def process_single_pdf_with_pymupdf(self, pdfdoc):
        # check if the file exists; if not, return None
        if not os.path.isfile(pdfdoc):
            print(f'Warning: pdf file {pdfdoc} does not exist...skipping to next pdf file')
            return None

        doc = pymupdf.open(pdfdoc)  # open a document
        thistext = ''
        for page in doc:
            pagetext = page.get_text()  # get plain text (in UTF-8)
            thistext = thistext + '\n ' + pagetext  # adding a line break
        return thistext

    def process_all_pdfs(self):
        list_of_texts = []
        for pdfdoc in self.list_of_pdf_docs:
            # dispatch on the configured pdf module
            if self.pdfmodule == 'pypdf':
                pdftext = self.process_single_pdf_with_pypdf(pdfdoc)
            else:
                pdftext = self.process_single_pdf_with_pymupdf(pdfdoc)
            if pdftext is not None:
                list_of_texts.append([pdftext])
        return list_of_texts
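A minimal usage sketch for this loader; the file path is illustrative and must exist locally:

# hypothetical use of TextFromPdf with the pymupdf backend
from myutils.pdfloader import TextFromPdf

loader = TextFromPdf(pdfmodule='pymupdf',
                     list_of_pdf_docs=['./data/docs_for_rag/NIST.AI.600-1.pdf'])
texts = loader.process_all_pdfs()     # one single-element list of text per readable pdf
print(f'parsed {len(texts)} pdf(s); first doc has {len(texts[0][0])} characters')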
myutils/rag_pipeline_utils.py
ADDED
"""
rag_pipeline_utils.py

This python script implements various classes useful for a RAG pipeline.

Currently implemented:

Text splitting
    SimpleTextSplitter: uses RecursiveCharacterTextSplitter
    SemanticTextSplitter: uses SemanticChunker (different threshold types can be used)

VectorStore
    currently only sets up a Qdrant vector store in memory

AdvancedRetriever
    a simple retriever is a special case;
    advanced retriever: currently implements MultiQueryRetriever
"""

from operator import itemgetter
from typing import List

from langchain_core.runnables import RunnablePassthrough
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore

from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents import Document
from datasets import Dataset

from ragas import evaluate


def load_all_pdfs(list_of_pdf_files: List[str]) -> List[Document]:
    alldocs = []
    for pdffile in list_of_pdf_files:
        thisdoc = PyMuPDFLoader(file_path=pdffile).load()
        print(f'loaded {pdffile} with {len(thisdoc)} pages')
        alldocs.extend(thisdoc)
    print(f'loaded all files: total number of pages: {len(alldocs)}')
    return alldocs


class SimpleTextSplitter:
    def __init__(self,
                 chunk_size,
                 chunk_overlap,
                 documents):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.documents = documents
        return

    def split_text(self):
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap
        )
        all_splits = text_splitter.split_documents(self.documents)
        return all_splits


class SemanticTextSplitter:
    def __init__(self,
                 llm_embeddings=OpenAIEmbeddings(),
                 threshold_type="interquartile",
                 documents=None):
        self.llm_embeddings = llm_embeddings
        self.threshold_type = threshold_type
        self.documents = documents
        return

    def split_text(self):
        # use the configured threshold type rather than a hard-coded value
        text_splitter = SemanticChunker(
            embeddings=self.llm_embeddings,
            breakpoint_threshold_type=self.threshold_type
        )

        print(f'loaded {len(self.documents)} docs to be split')
        all_splits = text_splitter.split_documents(self.documents)
        print(f'returning docs split into {len(all_splits)} chunks')
        return all_splits


class VectorStore:
    def __init__(self,
                 location,
                 name,
                 documents,
                 size,
                 embedding=OpenAIEmbeddings()):
        self.location = location
        self.name = name
        self.size = size
        self.documents = documents
        self.embedding = embedding

        self.qdrant_client = QdrantClient(self.location)
        self.qdrant_client.create_collection(
            collection_name=self.name,
            vectors_config=VectorParams(size=self.size, distance=Distance.COSINE),
        )
        return

    def set_up_vectorstore(self):
        self.qdrant_vector_store = QdrantVectorStore(
            client=self.qdrant_client,
            collection_name=self.name,
            embedding=self.embedding
        )

        self.qdrant_vector_store.add_documents(self.documents)
        return self


class AdvancedRetriever:
    def __init__(self,
                 vectorstore):
        self.vectorstore = vectorstore
        return

    def set_up_simple_retriever(self):
        simple_retriever = self.vectorstore.as_retriever(
            search_type='similarity',
            search_kwargs={
                'k': 5
            }
        )
        return simple_retriever

    def set_up_multi_query_retriever(self, llm):
        retriever = self.set_up_simple_retriever()
        advanced_retriever = MultiQueryRetriever.from_llm(
            retriever=retriever, llm=llm
        )
        return advanced_retriever


def run_and_eval_rag_pipeline(location, collection_name, embed_dim, text_splits, embeddings,
                              prompt, qa_llm, metrics, test_df):
    """
    Helper function that runs and evaluates different rag pipelines
    based on the different text_splits presented to the pipeline.
    """
    # vector store
    vs = VectorStore(location=location,
                     name=collection_name,
                     documents=text_splits,
                     size=embed_dim,
                     embedding=embeddings)

    qdvs = vs.set_up_vectorstore().qdrant_vector_store

    # retriever
    retriever = AdvancedRetriever(vectorstore=qdvs).set_up_simple_retriever()

    # q&a chain using LCEL
    retrieval_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | qa_llm, "context": itemgetter("context")}
    )

    # get questions and ground truth
    test_questions = test_df["question"].values.tolist()
    test_groundtruths = test_df["ground_truth"].values.tolist()

    # run RAG pipeline
    answers = []
    contexts = []

    for question in test_questions:
        response = retrieval_chain.invoke({"question": question})
        answers.append(response["response"].content)
        contexts.append([context.page_content for context in response["context"]])

    # save RAG pipeline results to a HF Dataset object
    response_dataset = Dataset.from_dict({
        "question": test_questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truth": test_groundtruths
    })

    # run RAGAS evaluation using the given metrics
    results = evaluate(response_dataset, metrics)

    # save results to a df
    results_df = results.to_pandas()

    return results, results_df


def set_up_rag_pipeline(location, collection_name,
                        embeddings, embed_dim,
                        prompt, qa_llm,
                        text_splits):
    """
    Helper function that sets up a RAG pipeline.
    Inputs
        location: in-memory or persistent store
        collection_name: name of collection, string
        embeddings: object referring to the embeddings to be used
        embed_dim: embedding dimension
        prompt: prompt used in the RAG pipeline
        qa_llm: LLM used to generate the response
        text_splits: list containing text splits

    Returns a retrieval chain
    """
    # vector store
    vs = VectorStore(location=location,
                     name=collection_name,
                     documents=text_splits,
                     size=embed_dim,
                     embedding=embeddings)

    qdvs = vs.set_up_vectorstore().qdrant_vector_store

    # retriever
    retriever = AdvancedRetriever(vectorstore=qdvs).set_up_simple_retriever()

    # q&a chain using LCEL
    retrieval_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | qa_llm, "context": itemgetter("context")}
    )

    return retrieval_chain


def test_rag_pipeline(retrieval_chain, list_of_questions):
    """
    Tests the RAG pipeline.
    Inputs
        retrieval_chain: retrieval chain
        list_of_questions: list of questions to use to test the RAG pipeline
    Output
        list of RAG-pipeline-generated responses to each question
    """
    all_answers = []
    for i, question in enumerate(list_of_questions):
        response = retrieval_chain.invoke({'question': question})
        answer = response["response"].content
        all_answers.append(answer)
    return all_answers


def get_vibe_check_on_list_of_questions(collection_name,
                                        embeddings, embed_dim,
                                        prompt, llm, text_splits,
                                        list_of_questions):
    """
    HELPER FUNCTION
    Sets up a retrieval chain for each scenario and prints out the results
    of the q_and_a for any list of questions.
    """

    # set up the baseline retriever
    retrieval_chain = \
        set_up_rag_pipeline(location=":memory:", collection_name=collection_name,
                            embeddings=embeddings, embed_dim=embed_dim,
                            prompt=prompt, qa_llm=llm,
                            text_splits=text_splits)

    # run the RAG pipeline and get responses
    answers = test_rag_pipeline(retrieval_chain, list_of_questions)

    # create question, answer tuples
    q_and_a = [(x, y) for x, y in zip(list_of_questions, answers)]

    # print out question/answer pairs to review the performance of the pipeline
    for i, item in enumerate(q_and_a):
        print('=================')
        print(f'=====question number: {i} =============')
        print(item[0])
        print(item[1])

    return retrieval_chain, q_and_a
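The helpers above compose into a complete pipeline in a few lines. A minimal sketch, assuming `OPENAI_API_KEY` is set and the local PDF path exists; the prompt here is a stripped-down stand-in for the fuller one in app.py:

# hypothetical end-to-end wiring of the helpers in this module
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from myutils.rag_pipeline_utils import load_all_pdfs, SimpleTextSplitter, set_up_rag_pipeline

docs = load_all_pdfs(['./data/docs_for_rag/NIST.AI.600-1.pdf'])
splits = SimpleTextSplitter(chunk_size=1000, chunk_overlap=300, documents=docs).split_text()

prompt = ChatPromptTemplate.from_template(
    "Answer from the context only.\n\nContext:\n{context}\n\nQuestion:\n{question}"
)
chain = set_up_rag_pipeline(location=":memory:", collection_name="demo",
                            embeddings=OpenAIEmbeddings(model="text-embedding-3-small"),
                            embed_dim=1536,
                            prompt=prompt,
                            qa_llm=ChatOpenAI(model="gpt-4o-mini", temperature=0),
                            text_splits=splits)
print(chain.invoke({"question": "What risks does the document identify?"})["response"].content)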
myutils/ragas_pipeline.py
ADDED
"""
ragas_pipeline.py

Implements the core pipeline to generate a test set for RAGAS.
"""

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.testset.generator import TestsetGenerator
from ragas import evaluate

from datasets import Dataset

from myutils.rag_pipeline_utils import SimpleTextSplitter, SemanticTextSplitter, VectorStore, AdvancedRetriever


class RagasPipeline:
    def __init__(self, generator_llm_model, critic_llm_model, embedding_model,
                 number_of_qa_pairs,
                 chunk_size, chunk_overlap, documents,
                 distributions):
        self.generator_llm = ChatOpenAI(model=generator_llm_model)
        self.critic_llm = ChatOpenAI(model=critic_llm_model)
        self.embeddings = OpenAIEmbeddings(model=embedding_model)
        self.number_of_qa_pairs = number_of_qa_pairs

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.documents = documents

        self.distributions = distributions

        self.generator = TestsetGenerator.from_langchain(
            self.generator_llm,
            self.critic_llm,
            self.embeddings
        )
        return

    def generate_testset(self):
        text_splitter = SimpleTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            documents=self.documents
        )
        ragas_text_splits = text_splitter.split_text()

        testset = self.generator.generate_with_langchain_docs(
            ragas_text_splits,
            self.number_of_qa_pairs,
            self.distributions
        )

        testset_df = testset.to_pandas()
        return testset_df

    def ragas_eval_of_rag_pipeline(self, retrieval_chain, ragas_questions, ragas_groundtruths, ragas_metrics):
        """
        Helper function that runs and evaluates different rag pipelines
        based on RAGAS test questions.
        """
        # run the RAG pipeline on the RAGAS synthetic questions
        answers = []
        contexts = []

        for question in ragas_questions:
            response = retrieval_chain.invoke({"question": question})
            answers.append(response["response"].content)
            contexts.append([context.page_content for context in response["context"]])

        # save RAG pipeline results to a HF Dataset object
        response_dataset = Dataset.from_dict({
            "question": ragas_questions,
            "answer": answers,
            "contexts": contexts,
            "ground_truth": ragas_groundtruths
        })

        # run RAGAS evaluation using the given metrics
        results = evaluate(response_dataset, ragas_metrics)

        # save results to a df
        results_df = results.to_pandas()

        return results, results_df
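A minimal sketch of generating a synthetic test set with this class, assuming ragas==0.1.16 as pinned in requirements.txt (the evolutions import path follows that release) and that `docs` holds loaded LangChain Documents:

# hypothetical use of RagasPipeline to build a synthetic eval set
from ragas.testset.evolutions import simple, reasoning, multi_context
from myutils.ragas_pipeline import RagasPipeline

pipeline = RagasPipeline(generator_llm_model="gpt-4o-mini",
                         critic_llm_model="gpt-4o-mini",
                         embedding_model="text-embedding-3-small",
                         number_of_qa_pairs=10,
                         chunk_size=1000, chunk_overlap=300,
                         documents=docs,   # assumed: loaded LC Documents
                         distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})
testset_df = pipeline.generate_testset()   # DataFrame with question / ground_truth columns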
requirements.txt
ADDED
langchain
langchain-openai
langchain_core==0.2.38
langchain-community
langchainhub
langchain-qdrant
langchain_huggingface
langchain-text-splitters
langchain_experimental
ragas==0.1.16
openai
pymupdf
faiss-cpu
sentence_transformers
datasets
pyarrow==14.0.1
chainlit