File size: 3,503 Bytes
c4eb0c2
 
 
 
 
 
 
 
48d9af7
c4eb0c2
 
 
 
 
 
ead288d
c4eb0c2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ead288d
c4eb0c2
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os
from langchain_community.document_loaders import NotebookLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Qdrant
from langchain.retrievers import MultiQueryRetriever
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
from notebook_tutor.utils import tiktoken_len

# Load environment variables
# NOTE: this must run before the os.environ lookup below so that values
# supplied by python-dotenv (presumably from a local .env file) are visible.
load_dotenv()

# Configuration for OpenAI
# Indexing os.environ (rather than .get) makes the import fail immediately
# with a KeyError when OPENAI_API_KEY is not set.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
# Shared chat model instance; DocumentManager.initialize_retriever passes it to
# MultiQueryRetriever.from_llm. Configured as gpt-4o with temperature 0.1.
openai_chat_model = ChatOpenAI(model="gpt-4o", temperature=0.1)

class DocumentManager:
    """
    Load a Jupyter notebook and build a retriever over its contents.

    Typical use is ``load_document()`` followed by ``initialize_retriever()``;
    the results are then read back through ``get_documents()`` and
    ``get_retriever()``.

    Attributes:
        notebook_path (str): The path to the notebook file.
        docs (list): The loaded documents, or None until they are loaded.
        retriever (object): The retriever built by ``initialize_retriever()``,
            or None until it has been built.

    Methods:
        load_document(): Loads the documents from the notebook file.
        initialize_retriever(): Initializes the retriever object for document retrieval.
        get_retriever(): Returns the retriever object.
        get_documents(): Returns the loaded documents.
    """

    def __init__(self, notebook_path):
        self.notebook_path = notebook_path
        self.docs = None
        self.retriever = None

    def load_document(self):
        """
        Load the documents from the notebook file into ``self.docs``.

        A ``NotebookLoader`` is created for ``self.notebook_path`` with
        ``include_outputs=False``, ``max_output_length=20``,
        ``remove_newline=True`` and ``traceback=False``, and the result of its
        ``load()`` call replaces any previously stored documents.

        Returns:
            None

        Raises:
            Whatever ``NotebookLoader`` raises, e.g. when the path cannot be
            read (not handled here).
        """
        loader = NotebookLoader(
            self.notebook_path,
            include_outputs=False,
            max_output_length=20,
            remove_newline=True,
            traceback=False,
        )
        self.docs = loader.load()

    def initialize_retriever(self):
        """
        Build the retriever over the loaded documents and store it in ``self.retriever``.

        Steps:
            1. Load the notebook first if ``load_document()`` has not been
               called yet (``self.docs`` is still None).
            2. Split the documents with ``RecursiveCharacterTextSplitter``
               (chunk_size=200, chunk_overlap=50, lengths measured by
               ``tiktoken_len`` -- presumably token counts; see notebook_tutor.utils).
            3. Embed the chunks with the ``text-embedding-3-small`` OpenAI
               model into an in-memory Qdrant collection named "Notebook".
            4. Wrap the Qdrant retriever in a ``MultiQueryRetriever`` driven by
               the module-level ``openai_chat_model``, with
               ``include_original=True``.

        Returns:
            None
        """
        # Previously, calling this before load_document() passed None to the
        # splitter and failed with an opaque error; load on demand instead.
        if self.docs is None:
            self.load_document()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=200,
            chunk_overlap=50,
            length_function=tiktoken_len,
        )
        split_chunks = text_splitter.split_documents(self.docs)

        embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

        # location=":memory:" keeps the vector store in-process; nothing is persisted.
        qdrant_vectorstore = Qdrant.from_documents(
            split_chunks,
            embedding_model,
            location=":memory:",
            collection_name="Notebook",
        )
        qdrant_retriever = qdrant_vectorstore.as_retriever()

        # Create a multi-query retriever on top of the Qdrant retriever
        self.retriever = MultiQueryRetriever.from_llm(
            retriever=qdrant_retriever,
            llm=openai_chat_model,
            include_original=True,
        )

    def get_retriever(self):
        """Return the retriever, or None if ``initialize_retriever()`` has not run."""
        return self.retriever

    def get_documents(self):
        """Return the loaded documents, or None if nothing has been loaded yet."""
        return self.docs