umangchaudhry committed
Commit 832b728 · verified · 1 Parent(s): ab3e741

testing o1-preview refactored code

Files changed (1)
  1. app.py +396 -106
app.py CHANGED
@@ -1,20 +1,29 @@
 import os
+import re
 import streamlit as st
 from tempfile import NamedTemporaryFile
+import anthropic
+
+# Import necessary modules from LangChain
 from langchain.chains import create_retrieval_chain
 from langchain.chains.combine_documents import create_stuff_documents_chain
 from langchain_core.prompts import ChatPromptTemplate
-from langchain_openai import ChatOpenAI
-from langchain_community.document_loaders import PyPDFLoader
-from langchain_community.document_loaders import TextLoader
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+from langchain_community.document_loaders import PyPDFLoader, TextLoader
 from langchain_community.vectorstores import FAISS
-from langchain_openai import OpenAIEmbeddings
 from langchain_text_splitters import RecursiveCharacterTextSplitter
-import re
-import anthropic

 # Function to remove code block markers from the answer
 def remove_code_blocks(text):
+    """
+    Removes code block markers from the answer text.
+
+    Args:
+        text (str): The text from which code block markers should be removed.
+
+    Returns:
+        str: The text without code block markers.
+    """
     code_block_pattern = r"^```(?:\w+)?\n(.*?)\n```$"
     match = re.match(code_block_pattern, text, re.DOTALL)
     if match:
@@ -23,30 +32,49 @@ def remove_code_blocks(text):
     return text

 # Function to process PDF, run Q&A, and return results
-def process_pdf(api_key, uploaded_file, questions_path, prompt_path, display_placeholder):
+def generate_summary_from_pdf(api_key, uploaded_file, questions_path, prompt_path, display_placeholder):
+    """
+    Processes a PDF file, runs Q&A, and returns the results.
+
+    Args:
+        api_key (str): OpenAI API key.
+        uploaded_file: Uploaded PDF file.
+        questions_path (str): Path to the questions file.
+        prompt_path (str): Path to the system prompt file.
+        display_placeholder: Streamlit placeholder for displaying results.
+
+    Returns:
+        list: List of QA results.
+    """
+    # Set the OpenAI API key
     os.environ["OPENAI_API_KEY"] = api_key

+    # Save the uploaded PDF to a temporary file
     with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
         temp_pdf.write(uploaded_file.read())
         temp_pdf_path = temp_pdf.name

+    # Load and split the PDF into documents
     loader = PyPDFLoader(temp_pdf_path)
     docs = loader.load()
-
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
     splits = text_splitter.split_documents(docs)

+    # Create a vector store from the documents
     vectorstore = FAISS.from_documents(
-        documents=splits, embedding=OpenAIEmbeddings(model="text-embedding-3-large")
+        documents=splits,
+        embedding=OpenAIEmbeddings(model="text-embedding-3-large")
     )
     retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

+    # Load the system prompt
     if os.path.exists(prompt_path):
         with open(prompt_path, "r") as file:
             system_prompt = file.read()
     else:
         raise FileNotFoundError(f"The specified file was not found: {prompt_path}")

+    # Create the prompt template
     prompt = ChatPromptTemplate.from_messages(
         [
             ("system", system_prompt),
@@ -54,38 +82,60 @@ def process_pdf(api_key, uploaded_file, questions_path, prompt_path, display_placeholder):
         ]
     )

+    # Initialize the language model
     llm = ChatOpenAI(model="gpt-4o")
-    question_answer_chain = create_stuff_documents_chain(llm, prompt, document_variable_name="context")
+
+    # Create the question-answering chain
+    question_answer_chain = create_stuff_documents_chain(
+        llm, prompt, document_variable_name="context"
+    )
     rag_chain = create_retrieval_chain(retriever, question_answer_chain)

+    # Load the questions
     if os.path.exists(questions_path):
         with open(questions_path, "r") as file:
             questions = [line.strip() for line in file.readlines() if line.strip()]
     else:
         raise FileNotFoundError(f"The specified file was not found: {questions_path}")

+    # Process each question
     qa_results = []
     for question in questions:
         result = rag_chain.invoke({"input": question})
         answer = result["answer"]

+        # Remove code block markers
         answer = remove_code_blocks(answer)

         qa_text = f"### Question: {question}\n**Answer:**\n{answer}\n"
         qa_results.append(qa_text)
         display_placeholder.markdown("\n".join(qa_results), unsafe_allow_html=True)

+    # Clean up temporary PDF file
     os.remove(temp_pdf_path)

     return qa_results

-# New function to process multi-plan QA using an existing vector store
-def process_multi_plan_qa(api_key, input_text, display_placeholder):
+# Function to perform multi-plan QA using an existing shared vector store
+def perform_multi_plan_qa_shared_vectorstore(api_key, input_text, display_placeholder):
+    """
+    Performs multi-plan QA using an existing shared vector store.
+
+    Args:
+        api_key (str): OpenAI API key.
+        input_text (str): The question to ask.
+        display_placeholder: Streamlit placeholder for displaying results.
+    """
+    # Set the OpenAI API key
     os.environ["OPENAI_API_KEY"] = api_key

     # Load the existing vector store
     embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
-    vector_store = FAISS.load_local("Combined_Summary_Vectorstore", embeddings, allow_dangerous_deserialization=True)
+    vector_store = FAISS.load_local(
+        "Combined_Summary_Vectorstore",
+        embeddings,
+        allow_dangerous_deserialization=True
+    )

     # Convert the vector store to a retriever
     retriever = vector_store.as_retriever(search_kwargs={"k": 50})
@@ -108,7 +158,9 @@ def process_multi_plan_qa(api_key, input_text, display_placeholder):

     # Create the question-answering chain
     llm = ChatOpenAI(model="gpt-4o")
-    question_answer_chain = create_stuff_documents_chain(llm, prompt, document_variable_name="context")
+    question_answer_chain = create_stuff_documents_chain(
+        llm, prompt, document_variable_name="context"
+    )
     rag_chain = create_retrieval_chain(retriever, question_answer_chain)

     # Process the input text
@@ -118,14 +170,27 @@ def process_multi_plan_qa(api_key, input_text, display_placeholder):
     # Display the answer
     display_placeholder.markdown(f"**Answer:**\n{answer}")

-def multi_plan_qa_multi_vectorstore(api_key, input_text, display_placeholder):
+# Function to perform multi-plan QA using multiple individual vector stores
+def perform_multi_plan_qa_multi_vectorstore(api_key, input_text, display_placeholder):
+    """
+    Performs multi-plan QA using multiple individual vector stores.
+
+    Args:
+        api_key (str): OpenAI API key.
+        input_text (str): The question to ask.
+        display_placeholder: Streamlit placeholder for displaying results.
+    """
+    # Set the OpenAI API key
     os.environ["OPENAI_API_KEY"] = api_key

     # Directory containing individual vector stores
     vectorstore_directory = "Individual_Summary_Vectorstores"

     # List all vector store directories
-    vectorstore_names = [d for d in os.listdir(vectorstore_directory) if os.path.isdir(os.path.join(vectorstore_directory, d))]
+    vectorstore_names = [
+        d for d in os.listdir(vectorstore_directory)
+        if os.path.isdir(os.path.join(vectorstore_directory, d))
+    ]

     # Initialize a list to collect all retrieved chunks
     all_retrieved_chunks = []
@@ -136,13 +201,17 @@ def multi_plan_qa_multi_vectorstore(api_key, input_text, display_placeholder):

         # Load the vector store
         embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
-        vector_store = FAISS.load_local(vectorstore_path, embeddings, allow_dangerous_deserialization=True)
+        vector_store = FAISS.load_local(
+            vectorstore_path,
+            embeddings,
+            allow_dangerous_deserialization=True
+        )

         # Convert the vector store to a retriever
         retriever = vector_store.as_retriever(search_kwargs={"k": 2})

         # Retrieve relevant chunks for the input text
-        retrieved_chunks = retriever.invoke("input_text")
+        retrieved_chunks = retriever.invoke(input_text)
         all_retrieved_chunks.extend(retrieved_chunks)

         # Read the system prompt for multi-document QA
@@ -163,20 +232,45 @@ def multi_plan_qa_multi_vectorstore(api_key, input_text, display_placeholder):

     # Create the question-answering chain
     llm = ChatOpenAI(model="gpt-4o")
-    question_answer_chain = create_stuff_documents_chain(llm, prompt, document_variable_name="context")
+    question_answer_chain = create_stuff_documents_chain(
+        llm, prompt, document_variable_name="context"
+    )

     # Process the combined context
-    result = question_answer_chain.invoke({"input": input_text, "context": all_retrieved_chunks})
+    result = question_answer_chain.invoke({
+        "input": input_text,
+        "context": all_retrieved_chunks
+    })

     # Display the answer
-    display_placeholder.markdown(f"**Answer:**\n{result}")
-
+    answer = result["answer"] if "answer" in result else result
+    display_placeholder.markdown(f"**Answer:**\n{answer}")

-# Function to compare document via one-to-many query approach
-def process_one_to_many_query(api_key, focus_input, comparison_inputs, input_text, display_placeholder):
+# Function to compare documents via one-to-many query approach
+def compare_documents_one_to_many(api_key, focus_input, comparison_inputs, input_text, display_placeholder):
+    """
+    Compares a focus document against multiple comparison documents using a one-to-many query approach.
+
+    Args:
+        api_key (str): OpenAI API key.
+        focus_input: Focus document (uploaded file or path to vector store).
+        comparison_inputs: List of comparison documents (uploaded files or paths to vector stores).
+        input_text (str): The comparison question to ask.
+        display_placeholder: Streamlit placeholder for displaying results.
+    """
+    # Set the OpenAI API key
     os.environ["OPENAI_API_KEY"] = api_key

     def load_documents_from_pdf(file):
+        """
+        Loads documents from a PDF file.
+
+        Args:
+            file: Uploaded PDF file.
+
+        Returns:
+            list: List of documents.
+        """
         with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
             temp_pdf.write(file.read())
             temp_pdf_path = temp_pdf.name
@@ -187,46 +281,69 @@ def process_one_to_many_query(api_key, focus_input, comparison_inputs, input_text, display_placeholder):
         return docs

     def load_vector_store_from_path(path):
+        """
+        Loads a vector store from a given path.
+
+        Args:
+            path (str): Path to the vector store.
+
+        Returns:
+            FAISS: Loaded vector store.
+        """
         embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
-        return FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
+        return FAISS.load_local(
+            path,
+            embeddings,
+            allow_dangerous_deserialization=True
+        )

     # Load focus documents or vector store
     if isinstance(focus_input, st.runtime.uploaded_file_manager.UploadedFile):
+        # If focus_input is an uploaded PDF file
         focus_docs = load_documents_from_pdf(focus_input)
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
         focus_splits = text_splitter.split_documents(focus_docs)
-        focus_vector_store = FAISS.from_documents(focus_splits, OpenAIEmbeddings(model="text-embedding-3-large"))
+        focus_vector_store = FAISS.from_documents(
+            focus_splits,
+            OpenAIEmbeddings(model="text-embedding-3-large")
+        )
         focus_retriever = focus_vector_store.as_retriever(search_kwargs={"k": 5})
     elif isinstance(focus_input, str) and os.path.isdir(focus_input):
+        # If focus_input is a path to a vector store
        focus_vector_store = load_vector_store_from_path(focus_input)
        focus_retriever = focus_vector_store.as_retriever(search_kwargs={"k": 5})
     else:
         raise ValueError("Invalid focus input type. Must be a PDF file or a path to a vector store.")

+    # Retrieve relevant chunks from the focus document
     focus_docs = focus_retriever.invoke(input_text)

+    # Initialize list to collect comparison chunks
     comparison_chunks = []
     for comparison_input in comparison_inputs:
         if isinstance(comparison_input, st.runtime.uploaded_file_manager.UploadedFile):
+            # If comparison_input is an uploaded PDF file
             comparison_docs = load_documents_from_pdf(comparison_input)
             text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=500)
             comparison_splits = text_splitter.split_documents(comparison_docs)
-            comparison_vector_store = FAISS.from_documents(comparison_splits, OpenAIEmbeddings(model="text-embedding-3-large"))
+            comparison_vector_store = FAISS.from_documents(
+                comparison_splits,
+                OpenAIEmbeddings(model="text-embedding-3-large")
+            )
             comparison_retriever = comparison_vector_store.as_retriever(search_kwargs={"k": 5})
         elif isinstance(comparison_input, str) and os.path.isdir(comparison_input):
+            # If comparison_input is a path to a vector store
             comparison_vector_store = load_vector_store_from_path(comparison_input)
             comparison_retriever = comparison_vector_store.as_retriever(search_kwargs={"k": 5})
         else:
             raise ValueError("Invalid comparison input type. Must be a PDF file or a path to a vector store.")

+        # Retrieve relevant chunks from the comparison document
         comparison_docs = comparison_retriever.invoke(input_text)
         comparison_chunks.extend(comparison_docs)

     # Construct the combined context
-    combined_context = (
-        focus_docs +
-        comparison_chunks
-    )
+    combined_context = focus_docs + comparison_chunks

     # Read the system prompt
     prompt_path = "Prompts/comparison_prompt.md"
@@ -247,7 +364,7 @@ def process_one_to_many_query(api_key, focus_input, comparison_inputs, input_text, display_placeholder):
     # Create the question-answering chain
     llm = ChatOpenAI(model="gpt-4o")
     question_answer_chain = create_stuff_documents_chain(
-        llm,
+        llm,
         prompt,
         document_variable_name="context"
     )
@@ -259,19 +376,47 @@ def process_one_to_many_query(api_key, focus_input, comparison_inputs, input_text, display_placeholder):
     })

     # Display the answer
-    display_placeholder.markdown(f"**Answer:**\n{result}")
+    answer = result["answer"] if "answer" in result else result
+    display_placeholder.markdown(f"**Answer:**\n{answer}")

 # Function to list vector store documents
 def list_vector_store_documents():
+    """
+    Lists available vector store documents.
+
+    Returns:
+        list: List of document names.
+    """
     # Assuming documents are stored in the "Individual_All_Vectorstores" directory
     directory_path = "Individual_All_Vectorstores"
     if not os.path.exists(directory_path):
-        raise FileNotFoundError(f"The directory '{directory_path}' does not exist. Run `create_and_save_individual_vector_stores()` to create it.")
+        raise FileNotFoundError(
+            f"The directory '{directory_path}' does not exist. "
+            "Run `create_and_save_individual_vector_stores()` to create it."
+        )
     # List all available vector stores by document name
-    documents = [f.replace("_vectorstore", "").replace("_", " ") for f in os.listdir(directory_path) if f.endswith("_vectorstore")]
+    documents = [
+        f.replace("_vectorstore", "").replace("_", " ")
+        for f in os.listdir(directory_path)
+        if f.endswith("_vectorstore")
+    ]
     return documents

-def compare_with_long_context(api_key, anthropic_api_key, input_text, focus_plan_path, focus_city_name, selected_summaries, display_placeholder):
+# Function to compare plans using a long context model
+def compare_plans_with_long_context_model(api_key, anthropic_api_key, input_text, focus_plan_path, focus_city_name, selected_summaries, display_placeholder):
+    """
+    Compares plans using a long context model.
+
+    Args:
+        api_key (str): OpenAI API key.
+        anthropic_api_key (str): Anthropic API key.
+        input_text (str): The comparison question to ask.
+        focus_plan_path (str): Path to the focus plan.
+        focus_city_name (str): Name of the focus city.
+        selected_summaries (list): List of selected summary documents.
+        display_placeholder: Streamlit placeholder for displaying results.
+    """
+    # Set the API keys
     os.environ["OPENAI_API_KEY"] = api_key
     os.environ["ANTHROPIC_API_KEY"] = anthropic_api_key

@@ -309,7 +454,6 @@ def compare_with_long_context(api_key, anthropic_api_key, input_text, focus_plan_path, focus_city_name, selected_summaries, display_placeholder):
     # Display the answer
     display_placeholder.markdown(f"**Answer:**\n{message.content}", unsafe_allow_html=True)

-
 # Streamlit app layout with tabs
 st.title("Climate Policy Analysis Tool")

@@ -317,50 +461,96 @@ st.title("Climate Policy Analysis Tool")
 api_key = st.text_input("Enter your OpenAI API key:", type="password", key="openai_key")

 # Create tabs
-tab1, tab2, tab3, tab4, tab5 = st.tabs(["Summary Generation", "Multi-Plan QA (Shared Vectorstore)", "Multi-Plan QA (Multi-Vectorstore)", "Plan Comparison Tool", "Plan Comparison with Long Context Model"])
+tab1, tab2, tab3, tab4, tab5 = st.tabs([
+    "Summary Generation",
+    "Multi-Plan QA (Shared Vectorstore)",
+    "Multi-Plan QA (Multi-Vectorstore)",
+    "Plan Comparison Tool",
+    "Plan Comparison with Long Context Model"
+])

 # First tab: Summary Generation
 with tab1:
-    uploaded_file = st.file_uploader("Upload a Climate Action Plan in PDF format", type="pdf", key="upload_file")
+    uploaded_file = st.file_uploader(
+        "Upload a Climate Action Plan in PDF format",
+        type="pdf",
+        key="upload_file"
+    )

     prompt_file_path = "Prompts/summary_tool_system_prompt.md"
     questions_file_path = "Prompts/summary_tool_questions.md"

-    if st.button("Generate", key="generate_button") and api_key and uploaded_file:
-        display_placeholder = st.empty()
-
-        with st.spinner("Processing..."):
-            try:
-                results = process_pdf(api_key, uploaded_file, questions_file_path, prompt_file_path, display_placeholder)
-
-                markdown_text = "\n".join(results)
-
-                # Use the uploaded file's name for the download file
-                base_name = os.path.splitext(uploaded_file.name)[0]
-                download_file_name = f"{base_name}_Summary.md"
-
-                st.download_button(
-                    label="Download Results as Markdown",
-                    data=markdown_text,
-                    file_name=download_file_name,
-                    mime="text/markdown",
-                    key="download_button"
-                )
-            except Exception as e:
-                st.error(f"An error occurred: {e}")
-
-# Second tab: Multi-Plan QA
+    if st.button("Generate", key="generate_button"):
+        if not api_key:
+            st.warning("Please provide your OpenAI API key.")
+        elif not uploaded_file:
+            st.warning("Please upload a PDF file.")
+        else:
+            display_placeholder = st.empty()
+            with st.spinner("Processing..."):
+                try:
+                    results = generate_summary_from_pdf(
+                        api_key,
+                        uploaded_file,
+                        questions_file_path,
+                        prompt_file_path,
+                        display_placeholder
+                    )
+                    markdown_text = "\n".join(results)
+
+                    # Use the uploaded file's name for the download file
+                    base_name = os.path.splitext(uploaded_file.name)[0]
+                    download_file_name = f"{base_name}_Summary.md"
+
+                    st.download_button(
+                        label="Download Results as Markdown",
+                        data=markdown_text,
+                        file_name=download_file_name,
+                        mime="text/markdown",
+                        key="download_button"
+                    )
+                except Exception as e:
+                    st.error(f"An error occurred: {e}")
+
+# Second tab: Multi-Plan QA (Shared Vectorstore)
 with tab2:
     input_text = st.text_input("Ask a question:", key="multi_plan_input")
-    if input_text and api_key:
-        display_placeholder2 = st.empty()
-        process_multi_plan_qa(api_key, input_text, display_placeholder2)
-
+    if st.button("Ask", key="multi_plan_qa_button"):
+        if not api_key:
+            st.warning("Please provide your OpenAI API key.")
+        elif not input_text:
+            st.warning("Please enter a question.")
+        else:
+            display_placeholder2 = st.empty()
+            with st.spinner("Processing..."):
+                try:
+                    perform_multi_plan_qa_shared_vectorstore(
+                        api_key,
+                        input_text,
+                        display_placeholder2
+                    )
+                except Exception as e:
+                    st.error(f"An error occurred: {e}")
+
+# Third tab: Multi-Plan QA (Multi-Vectorstore)
 with tab3:
     user_input = st.text_input("Ask a Question", key="multi_vectorstore_input")
-    if user_input and api_key:
-        display_placeholder3 = st.empty()
-        multi_plan_qa_multi_vectorstore(api_key, user_input, display_placeholder3)
+    if st.button("Ask", key="multi_vectorstore_qa_button"):
+        if not api_key:
+            st.warning("Please provide your OpenAI API key.")
+        elif not user_input:
+            st.warning("Please enter a question.")
+        else:
+            display_placeholder3 = st.empty()
+            with st.spinner("Processing..."):
+                try:
+                    perform_multi_plan_qa_multi_vectorstore(
+                        api_key,
+                        user_input,
+                        display_placeholder3
+                    )
+                except Exception as e:
+                    st.error(f"An error occurred: {e}")

 # Fourth tab: Plan Comparison Tool
 with tab4:
@@ -370,11 +560,22 @@ with tab4:
     vectorstore_documents = list_vector_store_documents()

     # Option to upload a new plan or select from existing vector stores
-    focus_option = st.radio("Choose a focus plan:", ("Select from existing vector stores", "Upload a new plan"), key="focus_option")
+    focus_option = st.radio(
+        "Choose a focus plan:",
+        ("Select from existing vector stores", "Upload a new plan"),
+        key="focus_option"
+    )

     if focus_option == "Upload a new plan":
-        focus_uploaded_file = st.file_uploader("Upload a Climate Action Plan to compare", type="pdf", key="focus_upload")
-        focus_city_name = st.text_input("Enter the city name for the uploaded plan:", key="focus_city_name")
+        focus_uploaded_file = st.file_uploader(
+            "Upload a Climate Action Plan to compare",
+            type="pdf",
+            key="focus_upload"
+        )
+        focus_city_name = st.text_input(
+            "Enter the city name for the uploaded plan:",
+            key="focus_city_name"
+        )
         if focus_uploaded_file is not None and focus_city_name:
             # Directly use the uploaded file
             focus_input = focus_uploaded_file
@@ -382,46 +583,102 @@ with tab4:
             focus_input = None
     else:
         # Select a focus plan from existing vector stores
-        selected_focus_plan = st.selectbox("Select a focus plan:", vectorstore_documents, key="select_focus_plan")
-        focus_input = os.path.join("Individual_All_Vectorstores", f"{selected_focus_plan}_vectorstore")
+        selected_focus_plan = st.selectbox(
+            "Select a focus plan:",
+            vectorstore_documents,
+            key="select_focus_plan"
+        )
+        focus_input = os.path.join(
+            "Individual_All_Vectorstores",
+            f"{selected_focus_plan}_vectorstore"
+        )
         focus_city_name = selected_focus_plan.replace("_", " ")

     # Option to upload comparison documents or select from existing vector stores
-    comparison_option = st.radio("Choose comparison documents:", ("Select from existing vector stores", "Upload new documents"), key="comparison_option")
+    comparison_option = st.radio(
+        "Choose comparison documents:",
+        ("Select from existing vector stores", "Upload new documents"),
+        key="comparison_option"
+    )

     if comparison_option == "Upload new documents":
-        comparison_files = st.file_uploader("Upload comparison documents", type="pdf", accept_multiple_files=True, key="comparison_files")
+        comparison_files = st.file_uploader(
+            "Upload comparison documents",
+            type="pdf",
+            accept_multiple_files=True,
+            key="comparison_files"
+        )
         comparison_inputs = comparison_files
     else:
         # Select comparison documents from existing vector stores
-        selected_comparison_plans = st.multiselect("Select comparison documents:", vectorstore_documents, key="select_comparison_plans")
-        comparison_inputs = [os.path.join("Individual_All_Vectorstores", f"{doc}_vectorstore") for doc in selected_comparison_plans]
+        selected_comparison_plans = st.multiselect(
+            "Select comparison documents:",
+            vectorstore_documents,
+            key="select_comparison_plans"
+        )
+        comparison_inputs = [
+            os.path.join(
+                "Individual_All_Vectorstores",
+                f"{doc}_vectorstore"
+            ) for doc in selected_comparison_plans
+        ]

-    input_text = st.text_input("Ask a comparison question:", key="comparison_input")
+    input_text = st.text_input(
+        "Ask a comparison question:",
+        key="comparison_input"
+    )

-    if st.button("Compare", key="compare_button") and api_key and input_text and focus_input and comparison_inputs:
-        display_placeholder4 = st.empty()
-        with st.spinner("Processing..."):
-            try:
-                # Call the process_one_to_many_query function
-                process_one_to_many_query(api_key, focus_input, comparison_inputs, input_text, display_placeholder4)
-
-            except Exception as e:
-                st.error(f"An error occurred: {e}")
+    if st.button("Compare", key="compare_button"):
+        if not api_key:
+            st.warning("Please provide your OpenAI API key.")
+        elif not input_text:
+            st.warning("Please enter a comparison question.")
+        elif not focus_input:
+            st.warning("Please provide a focus plan.")
+        elif not comparison_inputs:
+            st.warning("Please provide comparison documents.")
+        else:
+            display_placeholder4 = st.empty()
+            with st.spinner("Processing..."):
+                try:
+                    compare_documents_one_to_many(
+                        api_key,
+                        focus_input,
+                        comparison_inputs,
+                        input_text,
+                        display_placeholder4
+                    )
+                except Exception as e:
+                    st.error(f"An error occurred: {e}")

 # Fifth tab: Plan Comparison with Long Context Model
 with tab5:
     st.header("Plan Comparison with Long Context Model")

     # Anthropics API Key Input
-    anthropic_api_key = st.text_input("Enter your Anthropic API key:", type="password", key="anthropic_key")
+    anthropic_api_key = st.text_input(
+        "Enter your Anthropic API key:",
+        type="password",
+        key="anthropic_key"
+    )

     # Option to upload a new plan or select from a list
-    upload_option = st.radio("Choose a focus plan:", ("Select from existing plans", "Upload a new plan"), key="upload_option_long_context")
+    upload_option = st.radio(
+        "Choose a focus plan:",
+        ("Select from existing plans", "Upload a new plan"),
+        key="upload_option_long_context"
+    )

     if upload_option == "Upload a new plan":
-        focus_uploaded_file = st.file_uploader("Upload a Climate Action Plan to compare", type="pdf", key="focus_upload_long_context")
-        focus_city_name = st.text_input("Enter the city name for the uploaded plan:", key="focus_city_name_long_context")
+        focus_uploaded_file = st.file_uploader(
+            "Upload a Climate Action Plan to compare",
+            type="pdf",
+            key="focus_upload_long_context"
+        )
+        focus_city_name = st.text_input(
+            "Enter the city name for the uploaded plan:",
+            key="focus_city_name_long_context"
+        )
         if focus_uploaded_file is not None and focus_city_name:
             # Save uploaded file temporarily
             with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
@@ -432,22 +689,55 @@ with tab5:
     else:
         # List of existing plans in CAPS
         plan_list = [f.replace(".pdf", "") for f in os.listdir("CAPS") if f.endswith('.pdf')]
-        selected_plan = st.selectbox("Select a plan:", plan_list, key="selected_plan_long_context")
+        selected_plan = st.selectbox(
+            "Select a plan:",
+            plan_list,
+            key="selected_plan_long_context"
+        )
         focus_plan_path = os.path.join("CAPS", selected_plan)
         # Extract city name from the file name
         focus_city_name = os.path.splitext(selected_plan)[0].replace("_", " ")

     # List available summary documents for selection
     summaries_directory = "CAPS_Summaries"
-    summary_files = [f.replace(".md", "").replace("_", " ") for f in os.listdir(summaries_directory) if f.endswith('.md')]
-    selected_summaries = st.multiselect("Select summary documents for comparison:", summary_files, key="selected_summaries")
-
-    input_text = st.text_input("Ask a comparison question:", key="comparison_input_long_context")
-
-    if st.button("Compare with Long Context", key="compare_button_long_context") and api_key and anthropic_api_key and input_text and focus_plan_path and focus_city_name:
-        display_placeholder = st.empty()
-        with st.spinner("Processing..."):
-            try:
-                compare_with_long_context(api_key, anthropic_api_key, input_text, focus_plan_path, focus_city_name, selected_summaries, display_placeholder)
-            except Exception as e:
-                st.error(f"An error occurred: {e}")
+    summary_files = [
+        f.replace(".md", "").replace("_", " ")
+        for f in os.listdir(summaries_directory) if f.endswith('.md')
+    ]
+    selected_summaries = st.multiselect(
+        "Select summary documents for comparison:",
+        summary_files,
+        key="selected_summaries"
+    )
+
+    input_text = st.text_input(
+        "Ask a comparison question:",
+        key="comparison_input_long_context"
+    )
+
+    if st.button("Compare with Long Context", key="compare_button_long_context"):
+        if not api_key:
+            st.warning("Please provide your OpenAI API key.")
+        elif not anthropic_api_key:
+            st.warning("Please provide your Anthropic API key.")
+        elif not input_text:
+            st.warning("Please enter a comparison question.")
+        elif not focus_plan_path:
+            st.warning("Please provide a focus plan.")
+        elif not focus_city_name:
+            st.warning("Please enter the city name for the focus plan.")
+        else:
+            display_placeholder = st.empty()
+            with st.spinner("Processing..."):
+                try:
+                    compare_plans_with_long_context_model(
+                        api_key,
+                        anthropic_api_key,
+                        input_text,
+                        focus_plan_path,
+                        focus_city_name,
+                        selected_summaries,
+                        display_placeholder
+                    )
+                except Exception as e:
+                    st.error(f"An error occurred: {e}")