rchrdgwr committed
Commit 8aeabcb · 1 Parent(s): a723692

Update state object

Files changed (3):
  1. app.py +6 -4
  2. classes/app_state.py +5 -0
  3. utilities/rag_utilities.py +31 -15
app.py CHANGED
@@ -23,16 +23,18 @@ openai_api_key = os.getenv("OPENAI_API_KEY")
 
 # Setup our state
 state = AppState()
+state.set_debug(False)
+
 state.set_document_urls(document_urls)
+
 state.set_llm_model("gpt-3.5-turbo")
 state.set_embedding_model("text-embedding-3-small")
-
+state.set_chunk_size(1000)
+state.set_chunk_overlap(100)
 
 # Initialize the OpenAI LLM using LangChain
 llm = ChatOpenAI(model=state.llm_model, openai_api_key=openai_api_key)
-
-
-
+state.set_main_llm(llm)
 
 qdrant_retriever = create_vector_store(state)
 
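Note: app.py now sets a debug flag on the shared state before anything else, and the dprint(state, ...) calls in utilities/rag_utilities.py presumably consult that flag. utilities/debugger.py is not part of this commit, so the following is only a hypothetical sketch of how such a gate could look:

# Hypothetical sketch; utilities/debugger.py is not shown in this commit.
# Assumes dprint checks the AppState debug flag set via state.set_debug(...).
def dprint(state, *args):
    # getattr guards against debug never having been set on the state
    if getattr(state, "debug", False):
        print(*args)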
 
classes/app_state.py CHANGED
@@ -13,6 +13,7 @@ class AppState:
         self.titles = []
         self.documents = []
         self.combined_document_objects = []
+        self.main_llm = None
         self.retriever = None
 
         self.system_template = "You are a helpful assistant"
@@ -56,6 +57,10 @@ class AppState:
         self.combined_document_objects = combined_document_objects
     def set_retriever(self, retriever):
         self.retriever = retriever
+    def set_main_llm(self, main_llm):
+        self.main_llm = main_llm
+    def set_debug(self, debug):
+        self.debug = debug
     #
     # Method to update the user input
     def set_user_input(self, input_text):
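Note: main_llm is initialized to None in __init__, but debug is created only when set_debug() runs (at least in the hunks shown here), so code that reads it may want a getattr fallback. A small illustrative check, assuming only the attributes visible in this diff:

# Illustrative only: exercises the setters added in this commit.
from classes.app_state import AppState

state = AppState()
print(state.main_llm)                  # None, initialized in __init__
print(getattr(state, "debug", False))  # safe even before set_debug() is called
state.set_debug(True)
state.set_main_llm("any-llm-object")   # placeholder; app.py passes a ChatOpenAI instance
print(state.debug, state.main_llm)
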
utilities/rag_utilities.py CHANGED
@@ -5,10 +5,11 @@ from langchain_community.vectorstores import Qdrant
 from langchain_openai.embeddings import OpenAIEmbeddings
 import fitz
 import io
-import tiktoken
-import requests
 import os
+import requests
+import tiktoken
 from utilities.debugger import dprint
+import uuid
 
 def tiktoken_len(text):
     tokens = tiktoken.encoding_for_model("gpt-4o").encode(
@@ -55,6 +56,7 @@ def get_documents(state):
             "title": title,
             "metadata": metadata,
             "single_text_document": single_text_document,
+            "document_id": str(uuid.uuid4())
         }
         state.add_document(document)
         dprint(state, f"Title of Document: {title}")
@@ -64,14 +66,7 @@ def get_documents(state):
 
 def create_chunked_documents(state):
     get_documents(state)
-    # file_path_1 = "data/Blueprint-for-an-AI-Bill-of-Rights.pdf"
-    # file_path_2 = "data/NIST.AI.600-1.pdf"
-    # loader = PyMuPDFLoader(file_path_1)
-    # documents_1 = loader.load()
-    # loader = PyMuPDFLoader(file_path_2)
-    # documents_2 = loader.load()
-    # print(f"Number of pages in 1: {len(documents_1)}")
-    # print(f"Number of pages in 2: {len(documents_2)}")
+
 
 
     text_splitter = RecursiveCharacterTextSplitter(
@@ -80,22 +75,42 @@ def create_chunked_documents(state):
         length_function = tiktoken_len,
     )
     combined_document_objects = []
-
     dprint(state, "Chunking documents and creating document objects")
     for document in state.documents:
         dprint(state, f"processing documend: {document['title']}")
         text = document["single_text_document"]
         dprint(state, text)
         title = document["title"]
+        document_id = document["document_id"]
         chunks_document = text_splitter.split_text(text)
         dprint(state, len(chunks_document))
-        document_objects = [Document(page_content=chunk, metadata={"source": title, "document_id": "doc1"}) for chunk in chunks_document]
-        dprint(state, f"Number of chunks for Document: {len(chunks_document)}")
-        combined_document_objects = combined_document_objects + document_objects
+
+        for chunk_number, chunk in enumerate(chunks_document, start=1):
+            document_objects = Document(
+                page_content=chunk,
+                metadata={
+                    "source": title,
+                    "document_id": document.get("document_id", "default_id"),
+                    "chunk_number": chunk_number  # Add unique chunk number
+                }
+            )
+            combined_document_objects.append(document_objects)
     state.add_combined_document_objects(combined_document_objects)
 
 
-def create_vector_store(state):
+def create_vector_store(state, **kwargs):
+    for key, value in kwargs.items():
+        if hasattr(state, key):
+            setattr(state, key, value)
+        else:
+            print(f"Warning: {key} is not an attribute of the state object")
+
+    # Rest of your create_vector_store logic
+    print(f"Chunk size after update: {state.chunk_size}")
+
+
+
+
     create_chunked_documents(state)
     embedding_model = OpenAIEmbeddings(model=state.embedding_model)
 
@@ -106,4 +121,5 @@ def create_vector_store(state):
     )
     qdrant_retriever = qdrant_vectorstore.as_retriever()
     state.set_retriever(qdrant_retriever)
+    print("Vector store created")
     return qdrant_retriever
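
Note: create_vector_store() now accepts keyword overrides that are copied onto matching attributes of the state object before chunking and indexing. A usage sketch, assuming a state configured as in app.py (document_urls and the OpenAI key set up there):

# Usage sketch (assumed, not part of the commit).
from classes.app_state import AppState
from utilities.rag_utilities import create_vector_store

state = AppState()
state.set_debug(False)
state.set_document_urls(document_urls)  # document_urls as defined in app.py
state.set_embedding_model("text-embedding-3-small")
state.set_chunk_size(1000)
state.set_chunk_overlap(100)

# chunk_size already exists on the state, so it is overwritten to 500;
# an unrecognized keyword would only print the "not an attribute" warning.
retriever = create_vector_store(state, chunk_size=500)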