DrishtiSharma committed
Commit 5e05c49 · verified · 1 Parent(s): 9c3c191

Update interim.py

Files changed (1):
  1. interim.py +73 -126

interim.py CHANGED
@@ -1,13 +1,15 @@
  import os
  import requests
  import streamlit as st
- from langchain.chains import SequentialChain, LLMChain
  from langchain.prompts import PromptTemplate
  from langchain_groq import ChatGroq
  from langchain.document_loaders import PDFPlumberLoader
  from langchain_experimental.text_splitter import SemanticChunker
  from langchain_huggingface import HuggingFaceEmbeddings
  from langchain_chroma import Chroma

  # Set API Keys
  os.environ["GROQ_API_KEY"] = st.secrets.get("GROQ_API_KEY", "")
@@ -16,74 +18,104 @@ os.environ["GROQ_API_KEY"] = st.secrets.get("GROQ_API_KEY", "")
  llm_judge = ChatGroq(model="deepseek-r1-distill-llama-70b")
  rag_llm = ChatGroq(model="mixtral-8x7b-32768")

- st.title("❓")

  # Step 1: Choose PDF Source
- #### Initialize pdf_path
- pdf_path = None
- pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0)

  if pdf_source == "Upload a PDF file":
      uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
      if uploaded_file:
-         with open("temp.pdf", "wb") as f:
              f.write(uploaded_file.getbuffer())
-         pdf_path = "temp.pdf"

  elif pdf_source == "Enter a PDF URL":
-     pdf_url = st.text_input("Enter PDF URL:")
-     if pdf_url:
          with st.spinner("Downloading PDF..."):
              try:
                  response = requests.get(pdf_url)
                  if response.status_code == 200:
-                     with open("temp.pdf", "wb") as f:
                          f.write(response.content)
-                     pdf_path = "temp.pdf"
                      st.success("✅ PDF Downloaded Successfully!")
                  else:
                      st.error("❌ Failed to download PDF. Check the URL.")
-                     pdf_path = None
              except Exception as e:
                  st.error(f"Error downloading PDF: {e}")
-                 pdf_path = None
- else:
-     pdf_path = None

  # Step 2: Process PDF
- if pdf_path:
-     with st.spinner("Loading PDF..."):
-         loader = PDFPlumberLoader(pdf_path)
          docs = loader.load()
-
-         st.success(f"✅ **PDF Loaded!** Total Pages: {len(docs)}")

-     # Step 3: Chunking
      with st.spinner("Chunking the document..."):
          model_name = "nomic-ai/modernbert-embed-base"
-         embedding_model = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={'device': 'cpu'})
-
          text_splitter = SemanticChunker(embedding_model)
-         documents = text_splitter.split_documents(docs)

-         st.success(f"✅ **Document Chunked!** Total Chunks: {len(documents)}")
-
-     # Step 4: Setup Vectorstore
      with st.spinner("Creating vector store..."):
          vector_store = Chroma(
              collection_name="deepseek_collection",
              collection_metadata={"hnsw:space": "cosine"},
-             embedding_function=embedding_model
          )
-         vector_store.add_documents(documents)
-
-         st.success("✅ **Vector Store Created!**")
-
-     # Step 5: Query Input
      query = st.text_input("🔍 Enter a Query:")
      if query:
          with st.spinner("Retrieving relevant contexts..."):
-             retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})
              contexts = retriever.invoke(query)
              context_texts = [doc.page_content for doc in contexts]
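Both the old and new versions share this retrieval call. As a minimal standalone sketch (assuming a populated `vector_store` built as in Step 4, and a user `query` string):

# Sketch only, not part of the commit: top-k similarity retrieval.
retriever = vector_store.as_retriever(
    search_type="similarity",   # plain nearest-neighbour search
    search_kwargs={"k": 5},     # return the 5 closest chunks
)
docs = retriever.invoke(query)  # -> list[Document]
context_texts = [doc.page_content for doc in docs]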
@@ -91,96 +123,11 @@ if pdf_path:
          for i, text in enumerate(context_texts, 1):
              st.write(f"**Context {i}:** {text[:500]}...")

-         # Step 6: Context Relevancy Checker
-         with st.spinner("Evaluating context relevancy..."):
-             relevancy_prompt = PromptTemplate(
-                 input_variables=["retriever_query", "context"],
-                 template="""You are an expert judge. Assign relevancy scores (0 or 1) for each context to answer the query.
-
- CONTEXT LIST:
- {context}
-
- QUERY:
- {retriever_query}
-
- RESPONSE (JSON):
- [{{"content": 1, "score": <0 or 1>, "reasoning": "<explanation>"}},
-  {{"content": 2, "score": <0 or 1>, "reasoning": "<explanation>"}},
- ...]"""
-             )
-             context_relevancy_chain = LLMChain(llm=llm_judge, prompt=relevancy_prompt, output_key="relevancy_response")
-             relevancy_response = context_relevancy_chain.invoke({"context": context_texts, "retriever_query": query})
-
-             st.success("✅ **Context Relevancy Evaluated!**")
-             st.json(relevancy_response['relevancy_response'])
-
-         # Step 7: Selecting Relevant Contexts
-         with st.spinner("Selecting the most relevant contexts..."):
-             relevant_prompt = PromptTemplate(
-                 input_variables=["relevancy_response"],
-                 template="""Extract contexts with score 0 from the relevancy response.
-
- RELEVANCY RESPONSE:
- {relevancy_response}
-
- RESPONSE (JSON):
- [{{"content": <content number>}}]
- """
-             )
-             pick_relevant_context_chain = LLMChain(llm=llm_judge, prompt=relevant_prompt, output_key="context_number")
-             relevant_response = pick_relevant_context_chain.invoke({"relevancy_response": relevancy_response['relevancy_response']})
-
-             st.success("✅ **Relevant Contexts Selected!**")
-             st.json(relevant_response['context_number'])
-
-         # Step 8: Retrieving Context for Response Generation
-         with st.spinner("Retrieving final context..."):
-             context_prompt = PromptTemplate(
-                 input_variables=["context_number", "context"],
-                 template="""Extract actual content for the selected context numbers.
-
- CONTEXT NUMBERS:
- {context_number}
-
- CONTENT LIST:
- {context}
-
- RESPONSE (JSON):
- [{{"context_number": <content number>, "relevant_content": "<actual context>"}}]
- """
-             )
-             relevant_contexts_chain = LLMChain(llm=llm_judge, prompt=context_prompt, output_key="relevant_contexts")
-             final_contexts = relevant_contexts_chain.invoke({"context_number": relevant_response['context_number'], "context": context_texts})
-
-             st.success("✅ **Final Contexts Retrieved!**")
-             st.json(final_contexts['relevant_contexts'])
-
-         # Step 9: Generate Final Response
          with st.spinner("Generating the final answer..."):
-             rag_prompt = PromptTemplate(
-                 input_variables=["query", "context"],
-                 template="""Generate a clear, fact-based response based on the context.
-
- QUERY:
- {query}
-
- CONTEXT:
- {context}
-
- ANSWER:
- """
-             )
-             response_chain = LLMChain(llm=rag_llm, prompt=rag_prompt, output_key="final_response")
-             final_response = response_chain.invoke({"query": query, "context": final_contexts['relevant_contexts']})
-
-             st.success("✅ **Final Response Generated!**")
-             st.success(final_response['final_response'])
-
-     # Step 10: Display Workflow Breakdown
-     st.write("🔍 **Workflow Breakdown:**")
-     st.json({
-         "Context Relevancy Evaluation": relevancy_response["relevancy_response"],
-         "Relevant Contexts": relevant_response["context_number"],
-         "Extracted Contexts": final_contexts["relevant_contexts"],
-         "Final Answer": final_response["final_response"]
-     })
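This hunk drops the four inline judge/answer steps (relevancy scoring, context picking, content extraction, answer synthesis) and the workflow breakdown, keeping only retrieval plus a single answer chain. Notably, the old file imported SequentialChain but never used it, invoking each LLMChain by hand. A hedged sketch of how those chains could have been composed (the output keys would need aligning for a faithful pipeline, e.g. the final prompt reading `relevant_contexts` rather than `context`):

# Sketch only, not part of the commit: composing the judge chains.
from langchain.chains import SequentialChain

judge_pipeline = SequentialChain(
    chains=[context_relevancy_chain, pick_relevant_context_chain,
            relevant_contexts_chain, response_chain],
    input_variables=["context", "retriever_query", "query"],
    output_variables=["final_response"],
)
result = judge_pipeline.invoke({"context": context_texts,
                                "retriever_query": query, "query": query})

The new side of the diff follows: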
 
  import os
+ import chromadb
  import requests
  import streamlit as st
+ from langchain.chains import LLMChain
  from langchain.prompts import PromptTemplate
  from langchain_groq import ChatGroq
  from langchain.document_loaders import PDFPlumberLoader
  from langchain_experimental.text_splitter import SemanticChunker
  from langchain_huggingface import HuggingFaceEmbeddings
  from langchain_chroma import Chroma
+ from prompts import rag_prompt, relevancy_prompt, relevant_context_picker_prompt, response_synth

  # Set API Keys
  os.environ["GROQ_API_KEY"] = st.secrets.get("GROQ_API_KEY", "")

  llm_judge = ChatGroq(model="deepseek-r1-distill-llama-70b")
  rag_llm = ChatGroq(model="mixtral-8x7b-32768")

+ llm_judge.verbose = True
+ rag_llm.verbose = True
+
+ # Clear ChromaDB cache to fix tenant issue
+ chromadb.api.client.SharedSystemClient.clear_system_cache()
+
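The clear_system_cache() call is a commonly reported workaround for Chroma's "Could not connect to tenant default_tenant" error under Streamlit: each rerun constructs a new Chroma client while chromadb caches the previous system instance. A guarded variant, as a sketch (clear_system_cache is an internal chromadb API and may move between releases, so the hasattr check is an assumption worth keeping):

# Sketch only: same workaround, guarded against API changes.
import chromadb

if hasattr(chromadb.api.client.SharedSystemClient, "clear_system_cache"):
    chromadb.api.client.SharedSystemClient.clear_system_cache()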
+ st.title("Blah")
+
+ # **Initialize session state variables**
+ if "pdf_path" not in st.session_state:
+     st.session_state.pdf_path = None
+ if "pdf_loaded" not in st.session_state:
+     st.session_state.pdf_loaded = False
+ if "chunked" not in st.session_state:
+     st.session_state.chunked = False
+ if "vector_created" not in st.session_state:
+     st.session_state.vector_created = False
+ if "vector_store_path" not in st.session_state:
+     st.session_state.vector_store_path = "./chroma_langchain_db"
+ if "vector_store" not in st.session_state:
+     st.session_state.vector_store = None
+ if "documents" not in st.session_state:
+     st.session_state.documents = None
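These flags implement a run-once gate for each pipeline stage: Streamlit re-executes the whole script on every widget interaction, and st.session_state is the only state that survives a rerun. A minimal sketch of the pattern (expensive_step is hypothetical, not from the commit):

# Sketch only: run-once gating under Streamlit's rerun model.
import streamlit as st

if "processed" not in st.session_state:
    st.session_state.processed = False

if not st.session_state.processed:
    st.session_state.result = expensive_step()  # hypothetical; runs once
    st.session_state.processed = True           # later reruns skip this block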
 
  # Step 1: Choose PDF Source
+ pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)

  if pdf_source == "Upload a PDF file":
      uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
      if uploaded_file:
+         st.session_state.pdf_path = "temp.pdf"
+         with open(st.session_state.pdf_path, "wb") as f:
              f.write(uploaded_file.getbuffer())
+         st.session_state.pdf_loaded = False
+         st.session_state.chunked = False
+         st.session_state.vector_created = False

  elif pdf_source == "Enter a PDF URL":
+     pdf_url = st.text_input("Enter PDF URL:", value="https://arxiv.org/pdf/2406.06998")
+     if pdf_url and st.session_state.pdf_path is None:
          with st.spinner("Downloading PDF..."):
              try:
                  response = requests.get(pdf_url)
                  if response.status_code == 200:
+                     st.session_state.pdf_path = "temp.pdf"
+                     with open(st.session_state.pdf_path, "wb") as f:
                          f.write(response.content)
+                     st.session_state.pdf_loaded = False
+                     st.session_state.chunked = False
+                     st.session_state.vector_created = False
                      st.success("✅ PDF Downloaded Successfully!")
                  else:
                      st.error("❌ Failed to download PDF. Check the URL.")
              except Exception as e:
                  st.error(f"Error downloading PDF: {e}")

  # Step 2: Process PDF
+ if st.session_state.pdf_path and not st.session_state.pdf_loaded:
+     with st.spinner("Loading and processing PDF..."):
+         loader = PDFPlumberLoader(st.session_state.pdf_path)
          docs = loader.load()
+         st.session_state.documents = docs
+         st.session_state.pdf_loaded = True
+         st.success(f"✅ **PDF Loaded!** Total Pages: {len(docs)}")

+ # Step 3: Chunking
+ if st.session_state.pdf_loaded and not st.session_state.chunked and st.session_state.documents:
      with st.spinner("Chunking the document..."):
          model_name = "nomic-ai/modernbert-embed-base"
+         embedding_model = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={'device': 'cpu'}, encode_kwargs={'normalize_embeddings': False})
          text_splitter = SemanticChunker(embedding_model)
+         documents = text_splitter.split_documents(st.session_state.documents)
+         st.session_state.documents = documents  # Store chunked docs
+         st.session_state.chunked = True
+         st.success(f"✅ **Document Chunked!** Total Chunks: {len(documents)}")
 
+ # Step 4: Setup Vectorstore
+ if st.session_state.chunked and not st.session_state.vector_created:
      with st.spinner("Creating vector store..."):
          vector_store = Chroma(
              collection_name="deepseek_collection",
              collection_metadata={"hnsw:space": "cosine"},
+             embedding_function=embedding_model,
+             persist_directory=st.session_state.vector_store_path
          )
+         vector_store.add_documents(st.session_state.documents)
+         num_documents = len(vector_store.get()["documents"])
+         st.session_state.vector_store = vector_store
+         st.session_state.vector_created = True
+         st.success(f"✅ **Vector Store Created!** Total documents stored: {num_documents}")
+
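Two details worth noting here: persist_directory writes the collection to disk so it outlives the process, and Chroma.get() with no filters returns every stored record, which is what makes len(...) a chunk count. A sketch of reattaching to the persisted collection on a later run (assumes the same embedding model and path; not part of the commit):

# Sketch only: reopen the persisted collection.
reopened = Chroma(
    collection_name="deepseek_collection",
    embedding_function=embedding_model,
    persist_directory="./chroma_langchain_db",
)
print(len(reopened.get()["ids"]))  # records already on disk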
+ # Step 5: Query Input
+ if st.session_state.vector_created and st.session_state.vector_store:
      query = st.text_input("🔍 Enter a Query:")
+
      if query:
          with st.spinner("Retrieving relevant contexts..."):
+             retriever = st.session_state.vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})
              contexts = retriever.invoke(query)
              context_texts = [doc.page_content for doc in contexts]

          for i, text in enumerate(context_texts, 1):
              st.write(f"**Context {i}:** {text[:500]}...")

+         # **Step 6: Generate Final Response**
          with st.spinner("Generating the final answer..."):
+             final_prompt = PromptTemplate(input_variables=["query", "context"], template=rag_prompt)
+             response_chain = LLMChain(llm=rag_llm, prompt=final_prompt, output_key="final_response")
+             final_response = response_chain.invoke({"query": query, "context": context_texts})
+
+             st.subheader("🟥 RAG Final Response")
+             st.success(final_response['final_response'])
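One caveat on the new Step 6: LLMChain is deprecated in recent LangChain releases in favour of composing runnables. A hedged sketch of the pipe-style equivalent (same rag_prompt, query, and context_texts as above; chat models return an AIMessage, hence .content):

# Sketch only: Runnable-style equivalent of the Step 6 chain.
final_prompt = PromptTemplate(input_variables=["query", "context"], template=rag_prompt)
chain = final_prompt | rag_llm
message = chain.invoke({"query": query, "context": context_texts})
st.success(message.content)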