umangchaudhry committed
Commit a93e85e · verified · 1 Parent(s): 8af57c5

Update app.py

Files changed (1)
  1. app.py +309 -74
app.py CHANGED
@@ -1,20 +1,29 @@
  import os
  import streamlit as st
  from tempfile import NamedTemporaryFile
  from langchain.chains import create_retrieval_chain
  from langchain.chains.combine_documents import create_stuff_documents_chain
  from langchain_core.prompts import ChatPromptTemplate
- from langchain_openai import ChatOpenAI
- from langchain_community.document_loaders import PyPDFLoader
- from langchain_community.document_loaders import TextLoader
  from langchain_community.vectorstores import FAISS
- from langchain_openai import OpenAIEmbeddings
  from langchain_text_splitters import RecursiveCharacterTextSplitter
- import re
- import anthropic
 
  # Function to remove code block markers from the answer
  def remove_code_blocks(text):
      code_block_pattern = r"^```(?:\w+)?\n(.*?)\n```$"
      match = re.match(code_block_pattern, text, re.DOTALL)
      if match:
@@ -24,29 +33,48 @@ def remove_code_blocks(text):
 
  # Function to process PDF, run Q&A, and return results
  def process_pdf(api_key, uploaded_file, questions_path, prompt_path, display_placeholder):
      os.environ["OPENAI_API_KEY"] = api_key
 
      with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
          temp_pdf.write(uploaded_file.read())
          temp_pdf_path = temp_pdf.name
 
      loader = PyPDFLoader(temp_pdf_path)
      docs = loader.load()
-
      text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
      splits = text_splitter.split_documents(docs)
 
      vectorstore = FAISS.from_documents(
-         documents=splits, embedding=OpenAIEmbeddings(model="text-embedding-3-large")
      )
      retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
 
      if os.path.exists(prompt_path):
          with open(prompt_path, "r") as file:
              system_prompt = file.read()
      else:
          raise FileNotFoundError(f"The specified file was not found: {prompt_path}")
 
      prompt = ChatPromptTemplate.from_messages(
          [
              ("system", system_prompt),
@@ -54,38 +82,60 @@ def process_pdf(api_key, uploaded_file, questions_path, prompt_path, display_pla
          ]
      )
 
      llm = ChatOpenAI(model="gpt-4o")
-     question_answer_chain = create_stuff_documents_chain(llm, prompt, document_variable_name="context")
      rag_chain = create_retrieval_chain(retriever, question_answer_chain)
 
      if os.path.exists(questions_path):
          with open(questions_path, "r") as file:
              questions = [line.strip() for line in file.readlines() if line.strip()]
      else:
          raise FileNotFoundError(f"The specified file was not found: {questions_path}")
 
      qa_results = []
      for question in questions:
          result = rag_chain.invoke({"input": question})
          answer = result["answer"]
 
          answer = remove_code_blocks(answer)
 
          qa_text = f"### Question: {question}\n**Answer:**\n{answer}\n"
          qa_results.append(qa_text)
          display_placeholder.markdown("\n".join(qa_results), unsafe_allow_html=True)
 
      os.remove(temp_pdf_path)
 
      return qa_results
 
- # New function to process multi-plan QA using an existing vector store
  def process_multi_plan_qa(api_key, input_text, display_placeholder):
      os.environ["OPENAI_API_KEY"] = api_key
 
      # Load the existing vector store
      embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
-     vector_store = FAISS.load_local("Combined_Summary_Vectorstore", embeddings, allow_dangerous_deserialization=True)
 
      # Convert the vector store to a retriever
      retriever = vector_store.as_retriever(search_kwargs={"k": 50})
@@ -108,7 +158,9 @@ def process_multi_plan_qa(api_key, input_text, display_placeholder):
 
      # Create the question-answering chain
      llm = ChatOpenAI(model="gpt-4o")
-     question_answer_chain = create_stuff_documents_chain(llm, prompt, document_variable_name="context")
      rag_chain = create_retrieval_chain(retriever, question_answer_chain)
 
      # Process the input text
@@ -118,14 +170,27 @@ def process_multi_plan_qa(api_key, input_text, display_placeholder):
      # Display the answer
      display_placeholder.markdown(f"**Answer:**\n{answer}")
 
- def multi_plan_qa_multi_vectorstore(api_key, input_text, display_placeholder):
      os.environ["OPENAI_API_KEY"] = api_key
 
      # Directory containing individual vector stores
      vectorstore_directory = "Individual_Summary_Vectorstores"
 
      # List all vector store directories
-     vectorstore_names = [d for d in os.listdir(vectorstore_directory) if os.path.isdir(os.path.join(vectorstore_directory, d))]
 
      # Initialize a list to collect all retrieved chunks
      all_retrieved_chunks = []
@@ -136,13 +201,17 @@ def multi_plan_qa_multi_vectorstore(api_key, input_text, display_placeholder):
 
          # Load the vector store
          embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
-         vector_store = FAISS.load_local(vectorstore_path, embeddings, allow_dangerous_deserialization=True)
 
          # Convert the vector store to a retriever
          retriever = vector_store.as_retriever(search_kwargs={"k": 2})
 
         # Retrieve relevant chunks for the input text
-         retrieved_chunks = retriever.invoke("input_text")
          all_retrieved_chunks.extend(retrieved_chunks)
 
      # Read the system prompt for multi-document QA
@@ -163,16 +232,30 @@ def multi_plan_qa_multi_vectorstore(api_key, input_text, display_placeholder):
 
      # Create the question-answering chain
      llm = ChatOpenAI(model="gpt-4o")
-     question_answer_chain = create_stuff_documents_chain(llm, prompt, document_variable_name="context")
 
      # Process the combined context
-     result = question_answer_chain.invoke({"input": input_text, "context": all_retrieved_chunks})
 
      # Display the answer
-     display_placeholder.markdown(f"**Answer:**\n{result}")
-
 
  def load_documents_from_pdf(file):
      # Check if the file is a PDF
      if not file.name.endswith('.pdf'):
          raise ValueError("The uploaded file is not a PDF. Please upload a PDF file.")
@@ -187,51 +270,84 @@ def load_documents_from_pdf(file):
      return docs
 
  def load_vector_store_from_path(path):
-     embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
-     return FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
 
 
- # Function to compare document via one-to-many query approach
  def process_one_to_many_query(api_key, focus_input, comparison_inputs, input_text, display_placeholder):
      os.environ["OPENAI_API_KEY"] = api_key
      print(comparison_inputs)
      # Load focus documents or vector store
      if isinstance(focus_input, st.runtime.uploaded_file_manager.UploadedFile):
          focus_docs = load_documents_from_pdf(focus_input)
          text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
          focus_splits = text_splitter.split_documents(focus_docs)
-         focus_vector_store = FAISS.from_documents(focus_splits, OpenAIEmbeddings(model="text-embedding-3-large"))
          focus_retriever = focus_vector_store.as_retriever(search_kwargs={"k": 5})
      elif isinstance(focus_input, str) and os.path.isdir(focus_input):
          focus_vector_store = load_vector_store_from_path(focus_input)
          focus_retriever = focus_vector_store.as_retriever(search_kwargs={"k": 5})
      else:
          raise ValueError("Invalid focus input type. Must be a PDF file or a path to a vector store.")
 
      focus_docs = focus_retriever.invoke(input_text)
 
      comparison_chunks = []
      for comparison_input in comparison_inputs:
          if isinstance(comparison_input, st.runtime.uploaded_file_manager.UploadedFile):
              comparison_docs = load_documents_from_pdf(comparison_input)
              text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=500)
              comparison_splits = text_splitter.split_documents(comparison_docs)
-             comparison_vector_store = FAISS.from_documents(comparison_splits, OpenAIEmbeddings(model="text-embedding-3-large"))
              comparison_retriever = comparison_vector_store.as_retriever(search_kwargs={"k": 5})
          elif isinstance(comparison_input, str) and os.path.isdir(comparison_input):
              comparison_vector_store = load_vector_store_from_path(comparison_input)
              comparison_retriever = comparison_vector_store.as_retriever(search_kwargs={"k": 5})
          else:
              raise ValueError("Invalid comparison input type. Must be a PDF file or a path to a vector store.")
 
          comparison_docs = comparison_retriever.invoke(input_text)
          comparison_chunks.extend(comparison_docs)
 
      # Construct the combined context
-     combined_context = (
-         focus_docs +
-         comparison_chunks
-     )
 
      # Read the system prompt
      prompt_path = "Prompts/comparison_prompt.md"
@@ -252,7 +368,7 @@ def process_one_to_many_query(api_key, focus_input, comparison_inputs, input_tex
      # Create the question-answering chain
      llm = ChatOpenAI(model="gpt-4o")
      question_answer_chain = create_stuff_documents_chain(
-         llm,
          prompt,
          document_variable_name="context"
      )
@@ -264,35 +380,67 @@ def process_one_to_many_query(api_key, focus_input, comparison_inputs, input_tex
      })
 
      # Display the answer
-     display_placeholder.markdown(f"**Answer:**\n{result}")
 
  # Function to list vector store documents
  def list_vector_store_documents():
      # Assuming documents are stored in the "Individual_All_Vectorstores" directory
      directory_path = "Individual_All_Vectorstores"
      if not os.path.exists(directory_path):
-         raise FileNotFoundError(f"The directory '{directory_path}' does not exist. Run `create_and_save_individual_vector_stores()` to create it.")
      # List all available vector stores by document name
-     documents = [f.replace("_vectorstore", "").replace("_", " ") for f in os.listdir(directory_path) if f.endswith("_vectorstore")]
      return documents
 
  def compare_with_long_context(api_key, anthropic_api_key, input_text, focus_plan_path, selected_summaries, display_placeholder):
      os.environ["OPENAI_API_KEY"] = api_key
      os.environ["ANTHROPIC_API_KEY"] = anthropic_api_key
-     # Load the focus plan
 
-     # Load focus documents or vector store
      if isinstance(focus_plan_path, st.runtime.uploaded_file_manager.UploadedFile):
          focus_docs = load_documents_from_pdf(focus_plan_path)
      elif isinstance(focus_plan_path, str):
          focus_loader = PyPDFLoader(focus_plan_path)
          focus_docs = focus_loader.load()
 
      # Concatenate selected summary documents
      summaries_directory = "CAPS_Summaries"
      summaries_content = ""
      for filename in selected_summaries:
-         with open(os.path.join(summaries_directory, f"{filename.replace(" Summary", "_Summary")}.md"), 'r') as file:
              summaries_content += file.read() + "\n\n"
 
      # Prepare the context
@@ -300,17 +448,15 @@ def compare_with_long_context(api_key, anthropic_api_key, input_text, focus_plan
 
      # Create the client and message
      client = anthropic.Anthropic(api_key=anthropic_api_key)
-     message = client.messages.create(
-         model="claude-3-5-sonnet-20241022",
-         max_tokens=1024,
-         messages=[
-             {"role": "user", "content": f"{input_text}\n\nFocus Document:\n{focus_context}\n\nSummaries:\n{summaries_content}"}
-         ]
      )
 
      # Display the answer
-     display_placeholder.markdown(f"**Answer:**\n{message.content}", unsafe_allow_html=True)
-
 
  # Streamlit app layout with tabs
  st.title("Climate Policy Analysis Tool")
@@ -319,11 +465,21 @@ st.title("Climate Policy Analysis Tool")
  api_key = st.text_input("Enter your OpenAI API key:", type="password", key="openai_key")
 
  # Create tabs
- tab1, tab2, tab3, tab4, tab5 = st.tabs(["Summary Generation", "Multi-Plan QA (Shared Vectorstore)", "Multi-Plan QA (Multi-Vectorstore)", "Plan Comparison Tool", "Plan Comparison with Long Context Model"])
 
  # First tab: Summary Generation
  with tab1:
-     uploaded_file = st.file_uploader("Upload a Climate Action Plan in PDF format", type="pdf", key="upload_file")
 
      prompt_file_path = "Prompts/summary_tool_system_prompt.md"
      questions_file_path = "Prompts/summary_tool_questions.md"
@@ -337,14 +493,19 @@ with tab1:
          display_placeholder = st.empty()
          with st.spinner("Processing..."):
              try:
-                 results = process_pdf(api_key, uploaded_file, questions_file_path, prompt_file_path, display_placeholder)
-
                  markdown_text = "\n".join(results)
-
                  # Use the uploaded file's name for the download file
                  base_name = os.path.splitext(uploaded_file.name)[0]
                  download_file_name = f"{base_name}_Summary.md"
-
                  st.download_button(
                      label="Download Results as Markdown",
                      data=markdown_text,
355
  except Exception as e:
356
  st.error(f"An error occurred: {e}")
357
 
358
- # Second tab: Multi-Plan QA
359
  with tab2:
360
  input_text = st.text_input("Ask a question:", key="multi_plan_input")
361
  if st.button("Ask", key="multi_plan_qa_button"):
@@ -375,7 +536,7 @@ with tab2:
375
  except Exception as e:
376
  st.error(f"An error occurred: {e}")
377
 
378
-
379
  with tab3:
380
  user_input = st.text_input("Ask a question:", key="multi_vectorstore_input")
381
  if st.button("Ask", key="multi_vectorstore_qa_button"):
@@ -387,7 +548,7 @@ with tab3:
387
  display_placeholder3 = st.empty()
388
  with st.spinner("Processing..."):
389
  try:
390
- multi_plan_qa_multi_vectorstore(
391
  api_key,
392
  user_input,
393
  display_placeholder3
@@ -403,10 +564,18 @@ with tab4:
      vectorstore_documents = list_vector_store_documents()
 
      # Option to upload a new plan or select from existing vector stores
-     focus_option = st.radio("Choose a focus plan:", ("Select from existing vector stores", "Upload a new plan"), key="focus_option")
 
      if focus_option == "Upload a new plan":
-         focus_uploaded_file = st.file_uploader("Upload a Climate Action Plan to compare", type="pdf", key="focus_upload")
          if focus_uploaded_file is not None:
              # Directly use the uploaded file
              focus_input = focus_uploaded_file
@@ -414,21 +583,49 @@ with tab4:
              focus_input = None
      else:
          # Select a focus plan from existing vector stores
-         selected_focus_plan = st.selectbox("Select a focus plan:", vectorstore_documents, key="select_focus_plan")
-         focus_input = os.path.join("Individual_All_Vectorstores", f"{selected_focus_plan.replace(" Summary", "_Summary")}_vectorstore")
 
      # Option to upload comparison documents or select from existing vector stores
-     comparison_option = st.radio("Choose comparison documents:", ("Select from existing vector stores", "Upload new documents"), key="comparison_option")
 
      if comparison_option == "Upload new documents":
-         comparison_files = st.file_uploader("Upload comparison documents", type="pdf", accept_multiple_files=True, key="comparison_files")
          comparison_inputs = comparison_files
      else:
          # Select comparison documents from existing vector stores
-         selected_comparison_plans = st.multiselect("Select comparison documents:", vectorstore_documents, key="select_comparison_plans")
-         comparison_inputs = [os.path.join("Individual_All_Vectorstores", f"{doc.replace(" Summary", "_Summary")}_vectorstore") for doc in selected_comparison_plans]
 
-     input_text = st.text_input("Ask a comparison question:", key="comparison_input")
 
      if st.button("Compare", key="compare_button"):
          if not api_key:
@@ -444,8 +641,13 @@ with tab4:
          with st.spinner("Processing..."):
              try:
                  # Call the process_one_to_many_query function
-                 process_one_to_many_query(api_key, focus_input, comparison_inputs, input_text, display_placeholder4)
-
              except Exception as e:
                  st.error(f"An error occurred: {e}")
 
@@ -454,30 +656,56 @@ with tab5:
      st.header("Plan Comparison with Long Context Model")
 
      # Anthropics API Key Input
-     anthropic_api_key = st.text_input("Enter your Anthropic API key:", type="password", key="anthropic_key")
 
      # Option to upload a new plan or select from a list
-     focus_option = st.radio("Choose a focus plan:", ("Select from existing plans", "Upload a new plan"), key="focus_option_long_context")
 
      if focus_option == "Upload a new plan":
-         focus_uploaded_file = st.file_uploader("Upload a Climate Action Plan to compare", type="pdf", key="focus_upload_long_context")
          if focus_uploaded_file is not None:
              # Directly use the uploaded file
              focus_plan_path = focus_uploaded_file
          else:
              focus_plan_path = None
      else:
-         # Select a focus plan from existing vector stores
          plan_list = [f.replace(".pdf", "") for f in os.listdir("CAPS") if f.endswith('.pdf')]
-         selected_focus_plan = st.selectbox("Select a focus plan:", plan_list, key="select_focus_plan_long_context")
          focus_plan_path = os.path.join("CAPS", f"{selected_focus_plan}.pdf")
 
      # List available summary documents for selection
      summaries_directory = "CAPS_Summaries"
-     summary_files = [f.replace(".md", "").replace("_", " ") for f in os.listdir(summaries_directory) if f.endswith('.md')]
-     selected_summaries = st.multiselect("Select summary documents for comparison:", summary_files, key="selected_summaries")
 
-     input_text = st.text_input("Ask a comparison question:", key="comparison_input_long_context")
 
      if st.button("Compare with Long Context", key="compare_button_long_context"):
          if not api_key:
@@ -492,6 +720,13 @@ with tab5:
              display_placeholder = st.empty()
              with st.spinner("Processing..."):
                  try:
-                     compare_with_long_context(api_key, anthropic_api_key, input_text, focus_plan_path, selected_summaries, display_placeholder)
                  except Exception as e:
-                     st.error(f"An error occurred: {e}")
  import os
+ import re
  import streamlit as st
  from tempfile import NamedTemporaryFile
+ import anthropic
+
+ # Import necessary modules from LangChain
  from langchain.chains import create_retrieval_chain
  from langchain.chains.combine_documents import create_stuff_documents_chain
  from langchain_core.prompts import ChatPromptTemplate
+ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+ from langchain_community.document_loaders import PyPDFLoader, TextLoader
  from langchain_community.vectorstores import FAISS
  from langchain_text_splitters import RecursiveCharacterTextSplitter
 
  # Function to remove code block markers from the answer
  def remove_code_blocks(text):
+     """
+     Removes code block markers from the answer text.
+
+     Args:
+         text (str): The text from which code block markers should be removed.
+
+     Returns:
+         str: The text without code block markers.
+     """
      code_block_pattern = r"^```(?:\w+)?\n(.*?)\n```$"
      match = re.match(code_block_pattern, text, re.DOTALL)
      if match:
 
33
 
34
  # Function to process PDF, run Q&A, and return results
35
  def process_pdf(api_key, uploaded_file, questions_path, prompt_path, display_placeholder):
36
+ """
37
+ Processes a PDF file, runs Q&A, and returns the results.
38
+
39
+ Args:
40
+ api_key (str): OpenAI API key.
41
+ uploaded_file: Uploaded PDF file.
42
+ questions_path (str): Path to the questions file.
43
+ prompt_path (str): Path to the system prompt file.
44
+ display_placeholder: Streamlit placeholder for displaying results.
45
+
46
+ Returns:
47
+ list: List of QA results.
48
+ """
49
+ # Set the OpenAI API key
50
  os.environ["OPENAI_API_KEY"] = api_key
51
 
52
+ # Save the uploaded PDF to a temporary file
53
  with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
54
  temp_pdf.write(uploaded_file.read())
55
  temp_pdf_path = temp_pdf.name
56
 
57
+ # Load and split the PDF into documents
58
  loader = PyPDFLoader(temp_pdf_path)
59
  docs = loader.load()
 
60
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
61
  splits = text_splitter.split_documents(docs)
62
 
63
+ # Create a vector store from the documents
64
  vectorstore = FAISS.from_documents(
65
+ documents=splits,
66
+ embedding=OpenAIEmbeddings(model="text-embedding-3-large")
67
  )
68
  retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
69
 
70
+ # Load the system prompt
71
  if os.path.exists(prompt_path):
72
  with open(prompt_path, "r") as file:
73
  system_prompt = file.read()
74
  else:
75
  raise FileNotFoundError(f"The specified file was not found: {prompt_path}")
76
 
77
+ # Create the prompt template
78
  prompt = ChatPromptTemplate.from_messages(
79
  [
80
  ("system", system_prompt),
 
82
  ]
83
  )
84
 
85
+ # Initialize the language model
86
  llm = ChatOpenAI(model="gpt-4o")
87
+
88
+ # Create the question-answering chain
89
+ question_answer_chain = create_stuff_documents_chain(
90
+ llm, prompt, document_variable_name="context"
91
+ )
92
  rag_chain = create_retrieval_chain(retriever, question_answer_chain)
93
 
94
+ # Load the questions
95
  if os.path.exists(questions_path):
96
  with open(questions_path, "r") as file:
97
  questions = [line.strip() for line in file.readlines() if line.strip()]
98
  else:
99
  raise FileNotFoundError(f"The specified file was not found: {questions_path}")
100
 
101
+ # Process each question
102
  qa_results = []
103
  for question in questions:
104
  result = rag_chain.invoke({"input": question})
105
  answer = result["answer"]
106
 
107
+ # Remove code block markers
108
  answer = remove_code_blocks(answer)
109
 
110
  qa_text = f"### Question: {question}\n**Answer:**\n{answer}\n"
111
  qa_results.append(qa_text)
112
  display_placeholder.markdown("\n".join(qa_results), unsafe_allow_html=True)
113
 
114
+ # Clean up temporary PDF file
115
  os.remove(temp_pdf_path)
116
 
117
  return qa_results
118
 
119
+ # Function to perform multi-plan QA using an existing vector store
120
  def process_multi_plan_qa(api_key, input_text, display_placeholder):
121
+ """
122
+ Performs multi-plan QA using an existing shared vector store.
123
+
124
+ Args:
125
+ api_key (str): OpenAI API key.
126
+ input_text (str): The question to ask.
127
+ display_placeholder: Streamlit placeholder for displaying results.
128
+ """
129
+ # Set the OpenAI API key
130
  os.environ["OPENAI_API_KEY"] = api_key
131
 
132
  # Load the existing vector store
133
  embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
134
+ vector_store = FAISS.load_local(
135
+ "Combined_Summary_Vectorstore",
136
+ embeddings,
137
+ allow_dangerous_deserialization=True
138
+ )
139
 
140
  # Convert the vector store to a retriever
141
  retriever = vector_store.as_retriever(search_kwargs={"k": 50})
 
158
 
159
  # Create the question-answering chain
160
  llm = ChatOpenAI(model="gpt-4o")
161
+ question_answer_chain = create_stuff_documents_chain(
162
+ llm, prompt, document_variable_name="context"
163
+ )
164
  rag_chain = create_retrieval_chain(retriever, question_answer_chain)
165
 
166
  # Process the input text
 
170
  # Display the answer
171
  display_placeholder.markdown(f"**Answer:**\n{answer}")
172
 
173
+ # Function to perform multi-plan QA using multiple individual vector stores
174
+ def process_multi_plan_qa_multi_vectorstore(api_key, input_text, display_placeholder):
175
+ """
176
+ Performs multi-plan QA using multiple individual vector stores.
177
+
178
+ Args:
179
+ api_key (str): OpenAI API key.
180
+ input_text (str): The question to ask.
181
+ display_placeholder: Streamlit placeholder for displaying results.
182
+ """
183
+ # Set the OpenAI API key
184
  os.environ["OPENAI_API_KEY"] = api_key
185
 
186
  # Directory containing individual vector stores
187
  vectorstore_directory = "Individual_Summary_Vectorstores"
188
 
189
  # List all vector store directories
190
+ vectorstore_names = [
191
+ d for d in os.listdir(vectorstore_directory)
192
+ if os.path.isdir(os.path.join(vectorstore_directory, d))
193
+ ]
194
 
195
  # Initialize a list to collect all retrieved chunks
196
  all_retrieved_chunks = []
 
201
 
202
  # Load the vector store
203
  embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
204
+ vector_store = FAISS.load_local(
205
+ vectorstore_path,
206
+ embeddings,
207
+ allow_dangerous_deserialization=True
208
+ )
209
 
210
  # Convert the vector store to a retriever
211
  retriever = vector_store.as_retriever(search_kwargs={"k": 2})
212
 
213
  # Retrieve relevant chunks for the input text
214
+ retrieved_chunks = retriever.invoke(input_text)
215
  all_retrieved_chunks.extend(retrieved_chunks)
216
 
217
  # Read the system prompt for multi-document QA
 
232
 
233
  # Create the question-answering chain
234
  llm = ChatOpenAI(model="gpt-4o")
235
+ question_answer_chain = create_stuff_documents_chain(
236
+ llm, prompt, document_variable_name="context"
237
+ )
238
 
239
  # Process the combined context
240
+ result = question_answer_chain.invoke({
241
+ "input": input_text,
242
+ "context": all_retrieved_chunks
243
+ })
244
 
245
  # Display the answer
246
+ answer = result["answer"] if "answer" in result else result
247
+ display_placeholder.markdown(f"**Answer:**\n{answer}")
248
 
249
  def load_documents_from_pdf(file):
250
+ """
251
+ Loads documents from a PDF file.
252
+
253
+ Args:
254
+ file: Uploaded PDF file.
255
+
256
+ Returns:
257
+ list: List of documents.
258
+ """
259
  # Check if the file is a PDF
260
  if not file.name.endswith('.pdf'):
261
  raise ValueError("The uploaded file is not a PDF. Please upload a PDF file.")
 
270
  return docs
271
 
272
  def load_vector_store_from_path(path):
273
+ """
274
+ Loads a vector store from a given path.
275
 
276
+ Args:
277
+ path (str): Path to the vector store.
278
+
279
+ Returns:
280
+ FAISS: Loaded vector store.
281
+ """
282
+ embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
283
+ return FAISS.load_local(
284
+ path,
285
+ embeddings,
286
+ allow_dangerous_deserialization=True
287
+ )
288
 
289
+ # Function to compare documents via one-to-many query approach
290
  def process_one_to_many_query(api_key, focus_input, comparison_inputs, input_text, display_placeholder):
291
+ """
292
+ Compares a focus document against multiple comparison documents using a one-to-many query approach.
293
+
294
+ Args:
295
+ api_key (str): OpenAI API key.
296
+ focus_input: Focus document (uploaded file or path to vector store).
297
+ comparison_inputs: List of comparison documents (uploaded files or paths to vector stores).
298
+ input_text (str): The comparison question to ask.
299
+ display_placeholder: Streamlit placeholder for displaying results.
300
+ """
301
+ # Set the OpenAI API key
302
  os.environ["OPENAI_API_KEY"] = api_key
303
  print(comparison_inputs)
304
  # Load focus documents or vector store
305
  if isinstance(focus_input, st.runtime.uploaded_file_manager.UploadedFile):
306
+ # If focus_input is an uploaded PDF file
307
  focus_docs = load_documents_from_pdf(focus_input)
308
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
309
  focus_splits = text_splitter.split_documents(focus_docs)
310
+ focus_vector_store = FAISS.from_documents(
311
+ focus_splits,
312
+ OpenAIEmbeddings(model="text-embedding-3-large")
313
+ )
314
  focus_retriever = focus_vector_store.as_retriever(search_kwargs={"k": 5})
315
  elif isinstance(focus_input, str) and os.path.isdir(focus_input):
316
+ # If focus_input is a path to a vector store
317
  focus_vector_store = load_vector_store_from_path(focus_input)
318
  focus_retriever = focus_vector_store.as_retriever(search_kwargs={"k": 5})
319
  else:
320
  raise ValueError("Invalid focus input type. Must be a PDF file or a path to a vector store.")
321
 
322
+ # Retrieve relevant chunks from the focus document
323
  focus_docs = focus_retriever.invoke(input_text)
324
 
325
+ # Initialize list to collect comparison chunks
326
  comparison_chunks = []
327
  for comparison_input in comparison_inputs:
328
  if isinstance(comparison_input, st.runtime.uploaded_file_manager.UploadedFile):
329
+ # If comparison_input is an uploaded PDF file
330
  comparison_docs = load_documents_from_pdf(comparison_input)
331
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=500)
332
  comparison_splits = text_splitter.split_documents(comparison_docs)
333
+ comparison_vector_store = FAISS.from_documents(
334
+ comparison_splits,
335
+ OpenAIEmbeddings(model="text-embedding-3-large")
336
+ )
337
  comparison_retriever = comparison_vector_store.as_retriever(search_kwargs={"k": 5})
338
  elif isinstance(comparison_input, str) and os.path.isdir(comparison_input):
339
+ # If comparison_input is a path to a vector store
340
  comparison_vector_store = load_vector_store_from_path(comparison_input)
341
  comparison_retriever = comparison_vector_store.as_retriever(search_kwargs={"k": 5})
342
  else:
343
  raise ValueError("Invalid comparison input type. Must be a PDF file or a path to a vector store.")
344
 
345
+ # Retrieve relevant chunks from the comparison document
346
  comparison_docs = comparison_retriever.invoke(input_text)
347
  comparison_chunks.extend(comparison_docs)
348
 
349
  # Construct the combined context
350
+ combined_context = focus_docs + comparison_chunks
 
 
 
351
 
352
  # Read the system prompt
353
  prompt_path = "Prompts/comparison_prompt.md"
 
368
  # Create the question-answering chain
369
  llm = ChatOpenAI(model="gpt-4o")
370
  question_answer_chain = create_stuff_documents_chain(
371
+ llm,
372
  prompt,
373
  document_variable_name="context"
374
  )
 
380
  })
381
 
382
  # Display the answer
383
+ answer = result["answer"] if "answer" in result else result
384
+ display_placeholder.markdown(f"**Answer:**\n{answer}")
385
 
386
  # Function to list vector store documents
387
  def list_vector_store_documents():
388
+ """
389
+ Lists available vector store documents.
390
+
391
+ Returns:
392
+ list: List of document names.
393
+ """
394
  # Assuming documents are stored in the "Individual_All_Vectorstores" directory
395
  directory_path = "Individual_All_Vectorstores"
396
  if not os.path.exists(directory_path):
397
+ raise FileNotFoundError(
398
+ f"The directory '{directory_path}' does not exist. "
399
+ "Run `create_and_save_individual_vector_stores()` to create it."
400
+ )
401
  # List all available vector stores by document name
402
+ documents = [
403
+ f.replace("_vectorstore", "").replace("_", " ")
404
+ for f in os.listdir(directory_path)
405
+ if f.endswith("_vectorstore")
406
+ ]
407
  return documents
408
 
409
+ # Function to compare plans using a long context model
410
  def compare_with_long_context(api_key, anthropic_api_key, input_text, focus_plan_path, selected_summaries, display_placeholder):
411
+ """
412
+ Compares plans using a long context model.
413
+
414
+ Args:
415
+ api_key (str): OpenAI API key.
416
+ anthropic_api_key (str): Anthropic API key.
417
+ input_text (str): The comparison question to ask.
418
+ focus_plan_path: Path to the focus plan or uploaded file.
419
+ selected_summaries (list): List of selected summary documents.
420
+ display_placeholder: Streamlit placeholder for displaying results.
421
+ """
422
+ # Set the API keys
423
  os.environ["OPENAI_API_KEY"] = api_key
424
  os.environ["ANTHROPIC_API_KEY"] = anthropic_api_key
 
425
 
426
+ # Load focus documents
427
  if isinstance(focus_plan_path, st.runtime.uploaded_file_manager.UploadedFile):
428
+ # If focus_plan_path is an uploaded file
429
  focus_docs = load_documents_from_pdf(focus_plan_path)
430
  elif isinstance(focus_plan_path, str):
431
+ # If focus_plan_path is a file path
432
  focus_loader = PyPDFLoader(focus_plan_path)
433
  focus_docs = focus_loader.load()
434
+ else:
435
+ raise ValueError("Invalid focus plan input type. Must be an uploaded file or a file path.")
436
 
437
  # Concatenate selected summary documents
438
  summaries_directory = "CAPS_Summaries"
439
  summaries_content = ""
440
  for filename in selected_summaries:
441
+ # Fix the filename by replacing ' Summary' with '_Summary'
442
+ summary_filename = f"{filename.replace(' Summary', '_Summary')}.md"
443
+ with open(os.path.join(summaries_directory, summary_filename), 'r') as file:
444
  summaries_content += file.read() + "\n\n"
445
 
446
  # Prepare the context
 
448
 
449
  # Create the client and message
450
  client = anthropic.Anthropic(api_key=anthropic_api_key)
451
+ response = client.completions.create(
452
+ model="claude-2",
453
+ max_tokens_to_sample=1024,
454
+ prompt=f"{input_text}\n\nFocus Document:\n{focus_context}\n\nSummaries:\n{summaries_content}"
 
 
455
  )
456
 
457
  # Display the answer
458
+ answer = response.completion
459
+ display_placeholder.markdown(f"**Answer:**\n{answer}", unsafe_allow_html=True)
460
 
461
  # Streamlit app layout with tabs
462
  st.title("Climate Policy Analysis Tool")
 
465
  api_key = st.text_input("Enter your OpenAI API key:", type="password", key="openai_key")
466
 
467
  # Create tabs
468
+ tab1, tab2, tab3, tab4, tab5 = st.tabs([
469
+ "Summary Generation",
470
+ "Multi-Plan QA (Shared Vectorstore)",
471
+ "Multi-Plan QA (Multi-Vectorstore)",
472
+ "Plan Comparison Tool",
473
+ "Plan Comparison with Long Context Model"
474
+ ])
475
 
476
  # First tab: Summary Generation
477
  with tab1:
478
+ uploaded_file = st.file_uploader(
479
+ "Upload a Climate Action Plan in PDF format",
480
+ type="pdf",
481
+ key="upload_file"
482
+ )
483
 
484
  prompt_file_path = "Prompts/summary_tool_system_prompt.md"
485
  questions_file_path = "Prompts/summary_tool_questions.md"
 
493
  display_placeholder = st.empty()
494
  with st.spinner("Processing..."):
495
  try:
496
+ results = process_pdf(
497
+ api_key,
498
+ uploaded_file,
499
+ questions_file_path,
500
+ prompt_file_path,
501
+ display_placeholder
502
+ )
503
  markdown_text = "\n".join(results)
504
+
505
  # Use the uploaded file's name for the download file
506
  base_name = os.path.splitext(uploaded_file.name)[0]
507
  download_file_name = f"{base_name}_Summary.md"
508
+
509
  st.download_button(
510
  label="Download Results as Markdown",
511
  data=markdown_text,
 
516
  except Exception as e:
517
  st.error(f"An error occurred: {e}")
518
 
519
+ # Second tab: Multi-Plan QA (Shared Vectorstore)
520
  with tab2:
521
  input_text = st.text_input("Ask a question:", key="multi_plan_input")
522
  if st.button("Ask", key="multi_plan_qa_button"):
 
536
  except Exception as e:
537
  st.error(f"An error occurred: {e}")
538
 
539
+ # Third tab: Multi-Plan QA (Multi-Vectorstore)
540
  with tab3:
541
  user_input = st.text_input("Ask a question:", key="multi_vectorstore_input")
542
  if st.button("Ask", key="multi_vectorstore_qa_button"):
 
548
  display_placeholder3 = st.empty()
549
  with st.spinner("Processing..."):
550
  try:
551
+ process_multi_plan_qa_multi_vectorstore(
552
  api_key,
553
  user_input,
554
  display_placeholder3
 
564
  vectorstore_documents = list_vector_store_documents()
565
 
566
  # Option to upload a new plan or select from existing vector stores
567
+ focus_option = st.radio(
568
+ "Choose a focus plan:",
569
+ ("Select from existing vector stores", "Upload a new plan"),
570
+ key="focus_option"
571
+ )
572
 
573
  if focus_option == "Upload a new plan":
574
+ focus_uploaded_file = st.file_uploader(
575
+ "Upload a Climate Action Plan to compare",
576
+ type="pdf",
577
+ key="focus_upload"
578
+ )
579
  if focus_uploaded_file is not None:
580
  # Directly use the uploaded file
581
  focus_input = focus_uploaded_file
 
583
  focus_input = None
584
  else:
585
  # Select a focus plan from existing vector stores
586
+ selected_focus_plan = st.selectbox(
587
+ "Select a focus plan:",
588
+ vectorstore_documents,
589
+ key="select_focus_plan"
590
+ )
591
+ focus_input = os.path.join(
592
+ "Individual_All_Vectorstores",
593
+ f"{selected_focus_plan.replace(' Summary', '_Summary')}_vectorstore"
594
+ )
595
 
596
  # Option to upload comparison documents or select from existing vector stores
597
+ comparison_option = st.radio(
598
+ "Choose comparison documents:",
599
+ ("Select from existing vector stores", "Upload new documents"),
600
+ key="comparison_option"
601
+ )
602
 
603
  if comparison_option == "Upload new documents":
604
+ comparison_files = st.file_uploader(
605
+ "Upload comparison documents",
606
+ type="pdf",
607
+ accept_multiple_files=True,
608
+ key="comparison_files"
609
+ )
610
  comparison_inputs = comparison_files
611
  else:
612
  # Select comparison documents from existing vector stores
613
+ selected_comparison_plans = st.multiselect(
614
+ "Select comparison documents:",
615
+ vectorstore_documents,
616
+ key="select_comparison_plans"
617
+ )
618
+ comparison_inputs = [
619
+ os.path.join(
620
+ "Individual_All_Vectorstores",
621
+ f"{doc.replace(' Summary', '_Summary')}_vectorstore"
622
+ ) for doc in selected_comparison_plans
623
+ ]
624
 
625
+ input_text = st.text_input(
626
+ "Ask a comparison question:",
627
+ key="comparison_input"
628
+ )
629
 
630
  if st.button("Compare", key="compare_button"):
631
  if not api_key:
 
641
  with st.spinner("Processing..."):
642
  try:
643
  # Call the process_one_to_many_query function
644
+ process_one_to_many_query(
645
+ api_key,
646
+ focus_input,
647
+ comparison_inputs,
648
+ input_text,
649
+ display_placeholder4
650
+ )
651
  except Exception as e:
652
  st.error(f"An error occurred: {e}")
653
 
 
656
  st.header("Plan Comparison with Long Context Model")
657
 
658
  # Anthropics API Key Input
659
+ anthropic_api_key = st.text_input(
660
+ "Enter your Anthropic API key:",
661
+ type="password",
662
+ key="anthropic_key"
663
+ )
664
 
665
  # Option to upload a new plan or select from a list
666
+ focus_option = st.radio(
667
+ "Choose a focus plan:",
668
+ ("Select from existing plans", "Upload a new plan"),
669
+ key="focus_option_long_context"
670
+ )
671
 
672
  if focus_option == "Upload a new plan":
673
+ focus_uploaded_file = st.file_uploader(
674
+ "Upload a Climate Action Plan to compare",
675
+ type="pdf",
676
+ key="focus_upload_long_context"
677
+ )
678
  if focus_uploaded_file is not None:
679
  # Directly use the uploaded file
680
  focus_plan_path = focus_uploaded_file
681
  else:
682
  focus_plan_path = None
683
  else:
684
+ # List of existing plans in CAPS
685
  plan_list = [f.replace(".pdf", "") for f in os.listdir("CAPS") if f.endswith('.pdf')]
686
+ selected_focus_plan = st.selectbox(
687
+ "Select a focus plan:",
688
+ plan_list,
689
+ key="select_focus_plan_long_context"
690
+ )
691
  focus_plan_path = os.path.join("CAPS", f"{selected_focus_plan}.pdf")
692
 
693
  # List available summary documents for selection
694
  summaries_directory = "CAPS_Summaries"
695
+ summary_files = [
696
+ f.replace(".md", "").replace("_", " ")
697
+ for f in os.listdir(summaries_directory) if f.endswith('.md')
698
+ ]
699
+ selected_summaries = st.multiselect(
700
+ "Select summary documents for comparison:",
701
+ summary_files,
702
+ key="selected_summaries"
703
+ )
704
 
705
+ input_text = st.text_input(
706
+ "Ask a comparison question:",
707
+ key="comparison_input_long_context"
708
+ )
709
 
710
  if st.button("Compare with Long Context", key="compare_button_long_context"):
711
  if not api_key:
 
720
  display_placeholder = st.empty()
721
  with st.spinner("Processing..."):
722
  try:
723
+ compare_with_long_context(
724
+ api_key,
725
+ anthropic_api_key,
726
+ input_text,
727
+ focus_plan_path,
728
+ selected_summaries,
729
+ display_placeholder
730
+ )
731
  except Exception as e:
732
+ st.error(f"An error occurred: {e}")