JulsdL commited on
Commit
c4eb0c2
·
1 Parent(s): 881300d

Refactored the code for better maintainability in preparation for LangGraph multi-agent implementation

Browse files
CHANGELOG.md CHANGED
@@ -1,3 +1,13 @@
 
 
 
 
 
 
 
 
 
 
1
  version 0.1.0 [2024-05-13]
2
 
3
  ## Added
 
1
+ version 0.1.1 [2024-05-13]
2
+
3
+ ## Modified
4
+
5
+ - Modularization: The code has been broken down into several modules, each with a specific responsibility. This makes the code easier to understand, test, and maintain. For example, the DocumentManager class in document_processing.py is responsible for managing documents and retrieving information from them. Similarly, the RetrievalManager class in retrieval.py is responsible for processing questions using a retrieval-augmented QA chain and returning the response.
6
+
7
+ - Separation of Concerns: The frontend and backend logic have been separated into different files (chainlit_frontend.py and document_processing.py, retrieval.py, etc.), which makes the codebase easier to navigate and maintain.
8
+
9
+ - Encapsulation: The code now makes use of classes and methods to encapsulate related functionality. For instance, the DocumentManager class encapsulates the functionality related to document management, and the RetrievalManager class encapsulates the functionality related to question processing and response retrieval.
10
+
11
  version 0.1.0 [2024-05-13]
12
 
13
  ## Added
README.md CHANGED
@@ -28,9 +28,9 @@ OPENAI_API_KEY=your-key-here
28
  4. Run the application using the following command:
29
 
30
  ```bash
31
- chainlit run app.py
32
  ```
33
 
34
  ## Usage
35
 
36
- Start a chat session and upload a Jupyter notebook file. The application will process the document and you can then ask questions related to the content of the notebook. It might take some time to answer some question, so please be patient.
 
28
  4. Run the application using the following command:
29
 
30
  ```bash
31
+ chainlit run aims_tutor/app.py
32
  ```
33
 
34
  ## Usage
35
 
36
+ Start a chat session and upload a Jupyter notebook file. The application will process the document and you can then ask questions related to the content of the notebook. It might take some time to answer some questions (should be less than 1 min), so please be patient.
aims_tutor/__init__.py ADDED
File without changes
aims_tutor/app.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
from dotenv import load_dotenv

# Load environment variables (e.g. OPENAI_API_KEY) before the frontend module
# and its langchain imports run — document_processing reads the key at import time.
load_dotenv()

# Importing the frontend module registers the chainlit event handlers
# (@cl.on_chat_start / @cl.on_message). Chainlit invokes them itself when run
# via `chainlit run aims_tutor/app.py`, so they must NOT be called directly:
# the previous __main__ block called a non-existent `handle_user_query` (the
# handler is named `main`) and created a never-awaited coroutine from the
# async `start_chat`.
import aims_tutor.chainlit_frontend  # noqa: F401
aims_tutor/chainlit_frontend.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import chainlit as cl
from dotenv import load_dotenv

# Package-absolute imports so the module resolves when imported as
# `aims_tutor.chainlit_frontend` (per app.py / the README run command),
# consistent with `from aims_tutor.utils import tiktoken_len` in
# document_processing.py. Bare `from document_processing import ...` only
# works if aims_tutor/ itself happens to be on sys.path.
from aims_tutor.document_processing import DocumentManager
from aims_tutor.retrieval import RetrievalManager

# Load environment variables (OPENAI_API_KEY etc.)
load_dotenv()
8
+
9
@cl.on_chat_start
async def start_chat():
    """Chat-session start handler.

    Greets the user, waits for a notebook upload, then builds the document
    and retrieval managers and stores them in the chainlit user session for
    the @cl.on_message handler to use.
    """
    # Default LLM settings kept in the session for later use.
    chat_settings = {
        "model": "gpt-3.5-turbo",
        "temperature": 0,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    }
    cl.user_session.set("settings", chat_settings)

    await cl.Message(
        content="Welcome to the AIMS-Tutor! Please upload a Jupyter notebook (.ipynb and max. 5mb) to start."
    ).send()

    # Keep prompting until the user actually uploads a file.
    uploaded = None
    while uploaded is None:
        uploaded = await cl.AskFileMessage(
            content="Please upload a Jupyter notebook (.ipynb, max. 5mb):",
            accept={"application/x-ipynb+json": [".ipynb"]},
            max_size_mb=5,
        ).send()

    notebook_file = uploaded[0]  # first (and only) uploaded file
    if notebook_file:
        # Build the retrieval pipeline over the uploaded notebook and stash
        # everything in the session for the message handler.
        manager = DocumentManager(notebook_file.path)
        manager.load_document()
        manager.initialize_retriever()
        cl.user_session.set("docs", manager.get_documents())
        cl.user_session.set("retrieval_manager", RetrievalManager(manager.get_retriever()))
38
+
39
@cl.on_message
async def main(message: cl.Message):
    """Message handler: answer a user question via the retrieval-augmented QA chain."""
    # The retrieval pipeline is created at chat start; without it we cannot answer.
    manager = cl.user_session.get("retrieval_manager")
    if not manager:
        await cl.Message(content="No document processing setup found. Please upload a Jupyter notebook first.").send()
        return

    # Run the question through the RAG chain and send the answer back.
    answer = manager.notebook_QA(message.content)
    await cl.Message(content=answer).send()
aims_tutor/document_processing.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from langchain_community.document_loaders import NotebookLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Qdrant
from langchain.retrievers import MultiQueryRetriever
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
from aims_tutor.utils import tiktoken_len

# Load environment variables
load_dotenv()

# Configuration for OpenAI
# Reading the key here fails fast at import time if it is missing.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
# Module-level LLM shared by the multi-query retriever (see DocumentManager.initialize_retriever).
openai_chat_model = ChatOpenAI(model="gpt-4-turbo", temperature=0.1)
17
+
18
class DocumentManager:
    """
    A class for managing documents and retrieving information from them.

    Attributes:
        notebook_path (str): The path to the notebook file.
        docs (list): A list of loaded documents (None until load_document()).
        retriever (object): The retriever object used for document retrieval
            (None until initialize_retriever()).

    Methods:
        load_document(): Loads the documents from the notebook file.
        initialize_retriever(): Initializes the retriever object for document retrieval.
        get_retriever(): Returns the retriever object.
        get_documents(): Returns the loaded documents.
    """
    def __init__(self, notebook_path):
        self.notebook_path = notebook_path
        self.docs = None       # populated by load_document()
        self.retriever = None  # populated by initialize_retriever()

    def load_document(self):
        """
        Load the documents from the notebook file.

        Initializes a `NotebookLoader` for `self.notebook_path` and stores the
        loaded documents in `self.docs`.

        Returns:
            None
        """
        loader = NotebookLoader(
            self.notebook_path,
            include_outputs=False,  # cell outputs are dropped to keep chunks focused on source cells
            max_output_length=20,
            remove_newline=True,
            traceback=False
        )
        self.docs = loader.load()

    def initialize_retriever(self):
        """
        Build the retrieval pipeline over the loaded documents.

        Splits `self.docs` into token-length-bounded chunks, embeds them into
        an in-memory Qdrant vector store, and wraps the store's retriever in a
        `MultiQueryRetriever`, which is stored in `self.retriever`.

        Requires `load_document()` to have been called first (`self.docs` set).

        Returns:
            None
        """
        # Chunk by token count (tiktoken_len) rather than characters; the
        # 50-token overlap preserves context across chunk boundaries.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50, length_function=tiktoken_len)

        split_chunks = text_splitter.split_documents(self.docs)

        embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

        # In-memory store: the index only lives for this session.
        qdrant_vectorstore = Qdrant.from_documents(split_chunks, embedding_model, location=":memory:", collection_name="Notebook")

        qdrant_retriever = qdrant_vectorstore.as_retriever()  # Set the Qdrant vector store as a retriever

        multiquery_retriever = MultiQueryRetriever.from_llm(retriever=qdrant_retriever, llm=openai_chat_model, include_original=True)  # Create a multi-query retriever on top of the Qdrant retriever

        self.retriever = multiquery_retriever

    def get_retriever(self):
        # Accessor for the retriever built by initialize_retriever() (None before that).
        return self.retriever

    def get_documents(self):
        # Accessor for the documents loaded by load_document() (None before that).
        return self.docs
aims_tutor/prompt_templates.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.prompts import ChatPromptTemplate
2
+
3
class PromptTemplates:
    """
    A collection of prompt templates used for generating chat prompts.

    Attributes:
        rag_QA_prompt (ChatPromptTemplate): A prompt template for generating RAG QA prompts.

    Methods:
        __init__(): Initializes all prompt templates as instance variables.
        get_rag_qa_prompt(): Returns the RAG QA prompt.

    Example usage:
        prompt_templates = PromptTemplates()
        rag_qa_prompt = prompt_templates.get_rag_qa_prompt()
    """
    def __init__(self):
        # Initializes all prompt templates as instance variables.
        # The template instructs the model to answer only from the retrieved
        # {context} and to refuse when the {question} is unrelated to it.
        self.rag_QA_prompt = ChatPromptTemplate.from_template("""
        CONTEXT:
        {context}

        QUERY:
        {question}

        Answer the query in a pretty format if the context is related to it; otherwise, answer: 'Sorry, I can't answer. Please ask another question.'
        """)

    def get_rag_qa_prompt(self):
        # Returns the RAG QA prompt
        return self.rag_QA_prompt
aims_tutor/retrieval.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from operator import itemgetter

from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

# Package-absolute import, consistent with the `aims_tutor.*` import style used
# elsewhere (e.g. `from aims_tutor.utils import tiktoken_len` in
# document_processing.py); a bare `from prompt_templates import ...` fails when
# this module is imported as `aims_tutor.retrieval`.
from aims_tutor.prompt_templates import PromptTemplates
5
+
6
+
7
class RetrievalManager:
    """
    RetrievalManager class.

    This class represents a retrieval manager that processes questions using a retrieval-augmented QA chain and returns the response.

    Attributes:
        retriever (object): The retriever object used for retrieval.
        chat_model (object): The ChatOpenAI object representing the OpenAI Chat model.
        prompts (PromptTemplates): Source of the RAG QA prompt template.

    Methods:
        notebook_QA(question):
            Processes a question using the retrieval-augmented QA chain and returns the response.
    """
    def __init__(self, retriever):
        self.retriever = retriever
        self.chat_model = ChatOpenAI(model="gpt-4-turbo", temperature=0.1)
        self.prompts = PromptTemplates()

    def notebook_QA(self, question):
        """
        Processes a question using the retrieval-augmented QA chain and returns the response.

        Parameters:
            question (str): The question to be processed.

        Returns:
            str: The response generated by the retrieval-augmented QA chain.
        """
        # LCEL chain: retrieve context documents for the question, then feed
        # {context, question} into the RAG prompt and the chat model.
        # NOTE(review): RunnablePassthrough.assign(context=itemgetter("context"))
        # re-assigns "context" to its existing value and looks like a no-op —
        # confirm before simplifying.
        retrieval_augmented_qa_chain = (
            {"context": itemgetter("question") | self.retriever, "question": itemgetter("question")}
            | RunnablePassthrough.assign(context=itemgetter("context"))
            | {"response": self.prompts.get_rag_qa_prompt() | self.chat_model, "context": itemgetter("context")}
        )

        response = retrieval_augmented_qa_chain.invoke({"question": question})

        # The "response" entry is an AIMessage; return only its text content.
        return response["response"].content
aims_tutor/utils.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
import tiktoken

# Build the encoder once at import time: tiktoken.encoding_for_model() loads the
# BPE ranks, which is far too costly to repeat on every call — tiktoken_len runs
# once per chunk while the text splitter measures lengths.
_ENCODING = tiktoken.encoding_for_model("gpt-3.5-turbo")

def tiktoken_len(text):
    """Return the number of gpt-3.5-turbo BPE tokens in *text*."""
    return len(_ENCODING.encode(text))
app.py → main.py RENAMED
@@ -20,7 +20,7 @@ load_dotenv()
20
 
21
  # Configuration for OpenAI
22
  OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
23
- openai_chat_model = ChatOpenAI(model="gpt-4-turbo", temperature=0)
24
 
25
  # Define the RAG prompt
26
  RAG_PROMPT = """
@@ -68,7 +68,7 @@ async def start_chat():
68
 
69
  loader = NotebookLoader(
70
  notebook_path,
71
- include_outputs=True,
72
  max_output_length=20,
73
  remove_newline=True,
74
  traceback=False
@@ -82,33 +82,18 @@ async def start_chat():
82
  embedding_model = OpenAIEmbeddings(model="text-embedding-3-small") # Initialize the embedding model
83
  qdrant_vectorstore = Qdrant.from_documents(split_chunks, embedding_model, location=":memory:", collection_name="Notebook") # Create a Qdrant vector store
84
  qdrant_retriever = qdrant_vectorstore.as_retriever() # Set the Qdrant vector store as a retriever
85
- multiquery_retriever = MultiQueryRetriever.from_llm(retriever=qdrant_retriever, llm=openai_chat_model) # Create a multi-query retriever on top of the Qdrant retriever
86
 
87
  # Store the multiquery_retriever in the user session
88
  cl.user_session.set("multiquery_retriever", multiquery_retriever)
89
 
90
- @cl.on_message
91
- async def main(message: cl.Message):
92
- # Retrieve the multi-query retriever from session
93
- multiquery_retriever = cl.user_session.get("multiquery_retriever")
94
-
95
- if not multiquery_retriever:
96
- await message.reply("No document processing chain found. Please upload a Jupyter notebook first.")
97
- return
98
-
99
- question = message.content
100
- response = handle_query(question, multiquery_retriever) # Process the question
101
-
102
- msg = cl.Message(content=response)
103
- await msg.send()
104
-
105
 
106
  @cl.on_message
107
  async def main(message: cl.Message):
108
  # Retrieve the multi-query retriever from session
109
  multiquery_retriever = cl.user_session.get("multiquery_retriever")
110
  if not multiquery_retriever:
111
- await message.reply("No document processing setup found. Please upload a Jupyter notebook first.")
112
  return
113
 
114
  question = message.content
 
20
 
21
  # Configuration for OpenAI
22
  OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
23
+ openai_chat_model = ChatOpenAI(model="gpt-4-turbo", temperature=0.1)
24
 
25
  # Define the RAG prompt
26
  RAG_PROMPT = """
 
68
 
69
  loader = NotebookLoader(
70
  notebook_path,
71
+ include_outputs=False,
72
  max_output_length=20,
73
  remove_newline=True,
74
  traceback=False
 
82
  embedding_model = OpenAIEmbeddings(model="text-embedding-3-small") # Initialize the embedding model
83
  qdrant_vectorstore = Qdrant.from_documents(split_chunks, embedding_model, location=":memory:", collection_name="Notebook") # Create a Qdrant vector store
84
  qdrant_retriever = qdrant_vectorstore.as_retriever() # Set the Qdrant vector store as a retriever
85
+ multiquery_retriever = MultiQueryRetriever.from_llm(retriever=qdrant_retriever, llm=openai_chat_model, include_original=True) # Create a multi-query retriever on top of the Qdrant retriever
86
 
87
  # Store the multiquery_retriever in the user session
88
  cl.user_session.set("multiquery_retriever", multiquery_retriever)
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
  @cl.on_message
92
  async def main(message: cl.Message):
93
  # Retrieve the multi-query retriever from session
94
  multiquery_retriever = cl.user_session.get("multiquery_retriever")
95
  if not multiquery_retriever:
96
+ await cl.Message(content="No document processing setup found. Please upload a Jupyter notebook first.").send()
97
  return
98
 
99
  question = message.content