umangchaudhry committed on
Commit
8af57c5
·
verified ·
1 Parent(s): 832b728

fixed errors

Browse files
Files changed (1) hide show
  1. app.py +96 -342
app.py CHANGED
@@ -1,29 +1,20 @@
1
  import os
2
- import re
3
  import streamlit as st
4
  from tempfile import NamedTemporaryFile
5
- import anthropic
6
-
7
- # Import necessary modules from LangChain
8
  from langchain.chains import create_retrieval_chain
9
  from langchain.chains.combine_documents import create_stuff_documents_chain
10
  from langchain_core.prompts import ChatPromptTemplate
11
- from langchain_openai import ChatOpenAI, OpenAIEmbeddings
12
- from langchain_community.document_loaders import PyPDFLoader, TextLoader
 
13
  from langchain_community.vectorstores import FAISS
 
14
  from langchain_text_splitters import RecursiveCharacterTextSplitter
 
 
15
 
16
  # Function to remove code block markers from the answer
17
  def remove_code_blocks(text):
18
- """
19
- Removes code block markers from the answer text.
20
-
21
- Args:
22
- text (str): The text from which code block markers should be removed.
23
-
24
- Returns:
25
- str: The text without code block markers.
26
- """
27
  code_block_pattern = r"^```(?:\w+)?\n(.*?)\n```$"
28
  match = re.match(code_block_pattern, text, re.DOTALL)
29
  if match:
@@ -32,49 +23,30 @@ def remove_code_blocks(text):
32
  return text
33
 
34
  # Function to process PDF, run Q&A, and return results
35
- def generate_summary_from_pdf(api_key, uploaded_file, questions_path, prompt_path, display_placeholder):
36
- """
37
- Processes a PDF file, runs Q&A, and returns the results.
38
-
39
- Args:
40
- api_key (str): OpenAI API key.
41
- uploaded_file: Uploaded PDF file.
42
- questions_path (str): Path to the questions file.
43
- prompt_path (str): Path to the system prompt file.
44
- display_placeholder: Streamlit placeholder for displaying results.
45
-
46
- Returns:
47
- list: List of QA results.
48
- """
49
- # Set the OpenAI API key
50
  os.environ["OPENAI_API_KEY"] = api_key
51
 
52
- # Save the uploaded PDF to a temporary file
53
  with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
54
  temp_pdf.write(uploaded_file.read())
55
  temp_pdf_path = temp_pdf.name
56
 
57
- # Load and split the PDF into documents
58
  loader = PyPDFLoader(temp_pdf_path)
59
  docs = loader.load()
 
60
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
61
  splits = text_splitter.split_documents(docs)
62
 
63
- # Create a vector store from the documents
64
  vectorstore = FAISS.from_documents(
65
- documents=splits,
66
- embedding=OpenAIEmbeddings(model="text-embedding-3-large")
67
  )
68
  retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
69
 
70
- # Load the system prompt
71
  if os.path.exists(prompt_path):
72
  with open(prompt_path, "r") as file:
73
  system_prompt = file.read()
74
  else:
75
  raise FileNotFoundError(f"The specified file was not found: {prompt_path}")
76
 
77
- # Create the prompt template
78
  prompt = ChatPromptTemplate.from_messages(
79
  [
80
  ("system", system_prompt),
@@ -82,60 +54,38 @@ def generate_summary_from_pdf(api_key, uploaded_file, questions_path, prompt_pat
82
  ]
83
  )
84
 
85
- # Initialize the language model
86
  llm = ChatOpenAI(model="gpt-4o")
87
-
88
- # Create the question-answering chain
89
- question_answer_chain = create_stuff_documents_chain(
90
- llm, prompt, document_variable_name="context"
91
- )
92
  rag_chain = create_retrieval_chain(retriever, question_answer_chain)
93
 
94
- # Load the questions
95
  if os.path.exists(questions_path):
96
  with open(questions_path, "r") as file:
97
  questions = [line.strip() for line in file.readlines() if line.strip()]
98
  else:
99
  raise FileNotFoundError(f"The specified file was not found: {questions_path}")
100
 
101
- # Process each question
102
  qa_results = []
103
  for question in questions:
104
  result = rag_chain.invoke({"input": question})
105
  answer = result["answer"]
106
 
107
- # Remove code block markers
108
  answer = remove_code_blocks(answer)
109
 
110
  qa_text = f"### Question: {question}\n**Answer:**\n{answer}\n"
111
  qa_results.append(qa_text)
112
  display_placeholder.markdown("\n".join(qa_results), unsafe_allow_html=True)
113
 
114
- # Clean up temporary PDF file
115
  os.remove(temp_pdf_path)
116
 
117
  return qa_results
118
 
119
- # Function to perform multi-plan QA using an existing shared vector store
120
- def perform_multi_plan_qa_shared_vectorstore(api_key, input_text, display_placeholder):
121
- """
122
- Performs multi-plan QA using an existing shared vector store.
123
-
124
- Args:
125
- api_key (str): OpenAI API key.
126
- input_text (str): The question to ask.
127
- display_placeholder: Streamlit placeholder for displaying results.
128
- """
129
- # Set the OpenAI API key
130
  os.environ["OPENAI_API_KEY"] = api_key
131
 
132
  # Load the existing vector store
133
  embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
134
- vector_store = FAISS.load_local(
135
- "Combined_Summary_Vectorstore",
136
- embeddings,
137
- allow_dangerous_deserialization=True
138
- )
139
 
140
  # Convert the vector store to a retriever
141
  retriever = vector_store.as_retriever(search_kwargs={"k": 50})
@@ -158,9 +108,7 @@ def perform_multi_plan_qa_shared_vectorstore(api_key, input_text, display_placeh
158
 
159
  # Create the question-answering chain
160
  llm = ChatOpenAI(model="gpt-4o")
161
- question_answer_chain = create_stuff_documents_chain(
162
- llm, prompt, document_variable_name="context"
163
- )
164
  rag_chain = create_retrieval_chain(retriever, question_answer_chain)
165
 
166
  # Process the input text
@@ -170,27 +118,14 @@ def perform_multi_plan_qa_shared_vectorstore(api_key, input_text, display_placeh
170
  # Display the answer
171
  display_placeholder.markdown(f"**Answer:**\n{answer}")
172
 
173
- # Function to perform multi-plan QA using multiple individual vector stores
174
- def perform_multi_plan_qa_multi_vectorstore(api_key, input_text, display_placeholder):
175
- """
176
- Performs multi-plan QA using multiple individual vector stores.
177
-
178
- Args:
179
- api_key (str): OpenAI API key.
180
- input_text (str): The question to ask.
181
- display_placeholder: Streamlit placeholder for displaying results.
182
- """
183
- # Set the OpenAI API key
184
  os.environ["OPENAI_API_KEY"] = api_key
185
 
186
  # Directory containing individual vector stores
187
  vectorstore_directory = "Individual_Summary_Vectorstores"
188
 
189
  # List all vector store directories
190
- vectorstore_names = [
191
- d for d in os.listdir(vectorstore_directory)
192
- if os.path.isdir(os.path.join(vectorstore_directory, d))
193
- ]
194
 
195
  # Initialize a list to collect all retrieved chunks
196
  all_retrieved_chunks = []
@@ -201,17 +136,13 @@ def perform_multi_plan_qa_multi_vectorstore(api_key, input_text, display_placeho
201
 
202
  # Load the vector store
203
  embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
204
- vector_store = FAISS.load_local(
205
- vectorstore_path,
206
- embeddings,
207
- allow_dangerous_deserialization=True
208
- )
209
 
210
  # Convert the vector store to a retriever
211
  retriever = vector_store.as_retriever(search_kwargs={"k": 2})
212
 
213
  # Retrieve relevant chunks for the input text
214
- retrieved_chunks = retriever.invoke(input_text)
215
  all_retrieved_chunks.extend(retrieved_chunks)
216
 
217
  # Read the system prompt for multi-document QA
@@ -232,118 +163,75 @@ def perform_multi_plan_qa_multi_vectorstore(api_key, input_text, display_placeho
232
 
233
  # Create the question-answering chain
234
  llm = ChatOpenAI(model="gpt-4o")
235
- question_answer_chain = create_stuff_documents_chain(
236
- llm, prompt, document_variable_name="context"
237
- )
238
 
239
  # Process the combined context
240
- result = question_answer_chain.invoke({
241
- "input": input_text,
242
- "context": all_retrieved_chunks
243
- })
244
 
245
  # Display the answer
246
- answer = result["answer"] if "answer" in result else result
247
- display_placeholder.markdown(f"**Answer:**\n{answer}")
248
-
249
- # Function to compare documents via one-to-many query approach
250
- def compare_documents_one_to_many(api_key, focus_input, comparison_inputs, input_text, display_placeholder):
251
- """
252
- Compares a focus document against multiple comparison documents using a one-to-many query approach.
253
-
254
- Args:
255
- api_key (str): OpenAI API key.
256
- focus_input: Focus document (uploaded file or path to vector store).
257
- comparison_inputs: List of comparison documents (uploaded files or paths to vector stores).
258
- input_text (str): The comparison question to ask.
259
- display_placeholder: Streamlit placeholder for displaying results.
260
- """
261
- # Set the OpenAI API key
262
- os.environ["OPENAI_API_KEY"] = api_key
263
-
264
- def load_documents_from_pdf(file):
265
- """
266
- Loads documents from a PDF file.
267
 
268
- Args:
269
- file: Uploaded PDF file.
270
 
271
- Returns:
272
- list: List of documents.
273
- """
274
- with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
275
- temp_pdf.write(file.read())
276
- temp_pdf_path = temp_pdf.name
277
 
278
- loader = PyPDFLoader(temp_pdf_path)
279
- docs = loader.load()
280
- os.remove(temp_pdf_path)
281
- return docs
282
 
283
- def load_vector_store_from_path(path):
284
- """
285
- Loads a vector store from a given path.
 
286
 
287
- Args:
288
- path (str): Path to the vector store.
 
289
 
290
- Returns:
291
- FAISS: Loaded vector store.
292
- """
293
- embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
294
- return FAISS.load_local(
295
- path,
296
- embeddings,
297
- allow_dangerous_deserialization=True
298
- )
299
 
 
 
 
 
300
  # Load focus documents or vector store
301
  if isinstance(focus_input, st.runtime.uploaded_file_manager.UploadedFile):
302
- # If focus_input is an uploaded PDF file
303
  focus_docs = load_documents_from_pdf(focus_input)
304
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
305
  focus_splits = text_splitter.split_documents(focus_docs)
306
- focus_vector_store = FAISS.from_documents(
307
- focus_splits,
308
- OpenAIEmbeddings(model="text-embedding-3-large")
309
- )
310
  focus_retriever = focus_vector_store.as_retriever(search_kwargs={"k": 5})
311
  elif isinstance(focus_input, str) and os.path.isdir(focus_input):
312
- # If focus_input is a path to a vector store
313
  focus_vector_store = load_vector_store_from_path(focus_input)
314
  focus_retriever = focus_vector_store.as_retriever(search_kwargs={"k": 5})
315
  else:
316
  raise ValueError("Invalid focus input type. Must be a PDF file or a path to a vector store.")
317
 
318
- # Retrieve relevant chunks from the focus document
319
  focus_docs = focus_retriever.invoke(input_text)
320
 
321
- # Initialize list to collect comparison chunks
322
  comparison_chunks = []
323
  for comparison_input in comparison_inputs:
324
  if isinstance(comparison_input, st.runtime.uploaded_file_manager.UploadedFile):
325
- # If comparison_input is an uploaded PDF file
326
  comparison_docs = load_documents_from_pdf(comparison_input)
327
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=500)
328
  comparison_splits = text_splitter.split_documents(comparison_docs)
329
- comparison_vector_store = FAISS.from_documents(
330
- comparison_splits,
331
- OpenAIEmbeddings(model="text-embedding-3-large")
332
- )
333
  comparison_retriever = comparison_vector_store.as_retriever(search_kwargs={"k": 5})
334
  elif isinstance(comparison_input, str) and os.path.isdir(comparison_input):
335
- # If comparison_input is a path to a vector store
336
  comparison_vector_store = load_vector_store_from_path(comparison_input)
337
  comparison_retriever = comparison_vector_store.as_retriever(search_kwargs={"k": 5})
338
  else:
339
  raise ValueError("Invalid comparison input type. Must be a PDF file or a path to a vector store.")
340
 
341
- # Retrieve relevant chunks from the comparison document
342
  comparison_docs = comparison_retriever.invoke(input_text)
343
  comparison_chunks.extend(comparison_docs)
344
 
345
  # Construct the combined context
346
- combined_context = focus_docs + comparison_chunks
 
 
 
347
 
348
  # Read the system prompt
349
  prompt_path = "Prompts/comparison_prompt.md"
@@ -364,7 +252,7 @@ def compare_documents_one_to_many(api_key, focus_input, comparison_inputs, input
364
  # Create the question-answering chain
365
  llm = ChatOpenAI(model="gpt-4o")
366
  question_answer_chain = create_stuff_documents_chain(
367
- llm,
368
  prompt,
369
  document_variable_name="context"
370
  )
@@ -376,66 +264,35 @@ def compare_documents_one_to_many(api_key, focus_input, comparison_inputs, input
376
  })
377
 
378
  # Display the answer
379
- answer = result["answer"] if "answer" in result else result
380
- display_placeholder.markdown(f"**Answer:**\n{answer}")
381
 
382
  # Function to list vector store documents
383
  def list_vector_store_documents():
384
- """
385
- Lists available vector store documents.
386
-
387
- Returns:
388
- list: List of document names.
389
- """
390
  # Assuming documents are stored in the "Individual_All_Vectorstores" directory
391
  directory_path = "Individual_All_Vectorstores"
392
  if not os.path.exists(directory_path):
393
- raise FileNotFoundError(
394
- f"The directory '{directory_path}' does not exist. "
395
- "Run `create_and_save_individual_vector_stores()` to create it."
396
- )
397
  # List all available vector stores by document name
398
- documents = [
399
- f.replace("_vectorstore", "").replace("_", " ")
400
- for f in os.listdir(directory_path)
401
- if f.endswith("_vectorstore")
402
- ]
403
  return documents
404
 
405
- # Function to compare plans using a long context model
406
- def compare_plans_with_long_context_model(api_key, anthropic_api_key, input_text, focus_plan_path, focus_city_name, selected_summaries, display_placeholder):
407
- """
408
- Compares plans using a long context model.
409
-
410
- Args:
411
- api_key (str): OpenAI API key.
412
- anthropic_api_key (str): Anthropic API key.
413
- input_text (str): The comparison question to ask.
414
- focus_plan_path (str): Path to the focus plan.
415
- focus_city_name (str): Name of the focus city.
416
- selected_summaries (list): List of selected summary documents.
417
- display_placeholder: Streamlit placeholder for displaying results.
418
- """
419
- # Set the API keys
420
  os.environ["OPENAI_API_KEY"] = api_key
421
  os.environ["ANTHROPIC_API_KEY"] = anthropic_api_key
422
-
423
  # Load the focus plan
424
- focus_docs = []
425
- if focus_plan_path.endswith('.pdf'):
 
 
 
426
  focus_loader = PyPDFLoader(focus_plan_path)
427
  focus_docs = focus_loader.load()
428
- elif focus_plan_path.endswith('.md'):
429
- focus_loader = TextLoader(focus_plan_path)
430
- focus_docs = focus_loader.load()
431
- else:
432
- raise ValueError("Unsupported file format for focus plan.")
433
 
434
  # Concatenate selected summary documents
435
  summaries_directory = "CAPS_Summaries"
436
  summaries_content = ""
437
  for filename in selected_summaries:
438
- with open(os.path.join(summaries_directory, filename), 'r') as file:
439
  summaries_content += file.read() + "\n\n"
440
 
441
  # Prepare the context
@@ -454,6 +311,7 @@ def compare_plans_with_long_context_model(api_key, anthropic_api_key, input_text
454
  # Display the answer
455
  display_placeholder.markdown(f"**Answer:**\n{message.content}", unsafe_allow_html=True)
456
 
 
457
  # Streamlit app layout with tabs
458
  st.title("Climate Policy Analysis Tool")
459
 
@@ -461,21 +319,11 @@ st.title("Climate Policy Analysis Tool")
461
  api_key = st.text_input("Enter your OpenAI API key:", type="password", key="openai_key")
462
 
463
  # Create tabs
464
- tab1, tab2, tab3, tab4, tab5 = st.tabs([
465
- "Summary Generation",
466
- "Multi-Plan QA (Shared Vectorstore)",
467
- "Multi-Plan QA (Multi-Vectorstore)",
468
- "Plan Comparison Tool",
469
- "Plan Comparison with Long Context Model"
470
- ])
471
 
472
  # First tab: Summary Generation
473
  with tab1:
474
- uploaded_file = st.file_uploader(
475
- "Upload a Climate Action Plan in PDF format",
476
- type="pdf",
477
- key="upload_file"
478
- )
479
 
480
  prompt_file_path = "Prompts/summary_tool_system_prompt.md"
481
  questions_file_path = "Prompts/summary_tool_questions.md"
@@ -489,19 +337,14 @@ with tab1:
489
  display_placeholder = st.empty()
490
  with st.spinner("Processing..."):
491
  try:
492
- results = generate_summary_from_pdf(
493
- api_key,
494
- uploaded_file,
495
- questions_file_path,
496
- prompt_file_path,
497
- display_placeholder
498
- )
499
  markdown_text = "\n".join(results)
500
-
501
  # Use the uploaded file's name for the download file
502
  base_name = os.path.splitext(uploaded_file.name)[0]
503
  download_file_name = f"{base_name}_Summary.md"
504
-
505
  st.download_button(
506
  label="Download Results as Markdown",
507
  data=markdown_text,
@@ -512,7 +355,7 @@ with tab1:
512
  except Exception as e:
513
  st.error(f"An error occurred: {e}")
514
 
515
- # Second tab: Multi-Plan QA (Shared Vectorstore)
516
  with tab2:
517
  input_text = st.text_input("Ask a question:", key="multi_plan_input")
518
  if st.button("Ask", key="multi_plan_qa_button"):
@@ -524,7 +367,7 @@ with tab2:
524
  display_placeholder2 = st.empty()
525
  with st.spinner("Processing..."):
526
  try:
527
- perform_multi_plan_qa_shared_vectorstore(
528
  api_key,
529
  input_text,
530
  display_placeholder2
@@ -532,9 +375,9 @@ with tab2:
532
  except Exception as e:
533
  st.error(f"An error occurred: {e}")
534
 
535
- # Third tab: Multi-Plan QA (Multi-Vectorstore)
536
  with tab3:
537
- user_input = st.text_input("Ask a Question", key="multi_vectorstore_input")
538
  if st.button("Ask", key="multi_vectorstore_qa_button"):
539
  if not api_key:
540
  st.warning("Please provide your OpenAI API key.")
@@ -544,7 +387,7 @@ with tab3:
544
  display_placeholder3 = st.empty()
545
  with st.spinner("Processing..."):
546
  try:
547
- perform_multi_plan_qa_multi_vectorstore(
548
  api_key,
549
  user_input,
550
  display_placeholder3
@@ -560,73 +403,32 @@ with tab4:
560
  vectorstore_documents = list_vector_store_documents()
561
 
562
  # Option to upload a new plan or select from existing vector stores
563
- focus_option = st.radio(
564
- "Choose a focus plan:",
565
- ("Select from existing vector stores", "Upload a new plan"),
566
- key="focus_option"
567
- )
568
 
569
  if focus_option == "Upload a new plan":
570
- focus_uploaded_file = st.file_uploader(
571
- "Upload a Climate Action Plan to compare",
572
- type="pdf",
573
- key="focus_upload"
574
- )
575
- focus_city_name = st.text_input(
576
- "Enter the city name for the uploaded plan:",
577
- key="focus_city_name"
578
- )
579
- if focus_uploaded_file is not None and focus_city_name:
580
  # Directly use the uploaded file
581
  focus_input = focus_uploaded_file
582
  else:
583
  focus_input = None
584
  else:
585
  # Select a focus plan from existing vector stores
586
- selected_focus_plan = st.selectbox(
587
- "Select a focus plan:",
588
- vectorstore_documents,
589
- key="select_focus_plan"
590
- )
591
- focus_input = os.path.join(
592
- "Individual_All_Vectorstores",
593
- f"{selected_focus_plan}_vectorstore"
594
- )
595
- focus_city_name = selected_focus_plan.replace("_", " ")
596
 
597
  # Option to upload comparison documents or select from existing vector stores
598
- comparison_option = st.radio(
599
- "Choose comparison documents:",
600
- ("Select from existing vector stores", "Upload new documents"),
601
- key="comparison_option"
602
- )
603
 
604
  if comparison_option == "Upload new documents":
605
- comparison_files = st.file_uploader(
606
- "Upload comparison documents",
607
- type="pdf",
608
- accept_multiple_files=True,
609
- key="comparison_files"
610
- )
611
  comparison_inputs = comparison_files
612
  else:
613
  # Select comparison documents from existing vector stores
614
- selected_comparison_plans = st.multiselect(
615
- "Select comparison documents:",
616
- vectorstore_documents,
617
- key="select_comparison_plans"
618
- )
619
- comparison_inputs = [
620
- os.path.join(
621
- "Individual_All_Vectorstores",
622
- f"{doc}_vectorstore"
623
- ) for doc in selected_comparison_plans
624
- ]
625
 
626
- input_text = st.text_input(
627
- "Ask a comparison question:",
628
- key="comparison_input"
629
- )
630
 
631
  if st.button("Compare", key="compare_button"):
632
  if not api_key:
@@ -641,13 +443,9 @@ with tab4:
641
  display_placeholder4 = st.empty()
642
  with st.spinner("Processing..."):
643
  try:
644
- compare_documents_one_to_many(
645
- api_key,
646
- focus_input,
647
- comparison_inputs,
648
- input_text,
649
- display_placeholder4
650
- )
651
  except Exception as e:
652
  st.error(f"An error occurred: {e}")
653
 
@@ -656,64 +454,30 @@ with tab5:
656
  st.header("Plan Comparison with Long Context Model")
657
 
658
  # Anthropics API Key Input
659
- anthropic_api_key = st.text_input(
660
- "Enter your Anthropic API key:",
661
- type="password",
662
- key="anthropic_key"
663
- )
664
 
665
  # Option to upload a new plan or select from a list
666
- upload_option = st.radio(
667
- "Choose a focus plan:",
668
- ("Select from existing plans", "Upload a new plan"),
669
- key="upload_option_long_context"
670
- )
671
 
672
- if upload_option == "Upload a new plan":
673
- focus_uploaded_file = st.file_uploader(
674
- "Upload a Climate Action Plan to compare",
675
- type="pdf",
676
- key="focus_upload_long_context"
677
- )
678
- focus_city_name = st.text_input(
679
- "Enter the city name for the uploaded plan:",
680
- key="focus_city_name_long_context"
681
- )
682
- if focus_uploaded_file is not None and focus_city_name:
683
- # Save uploaded file temporarily
684
- with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
685
- temp_pdf.write(focus_uploaded_file.read())
686
- focus_plan_path = temp_pdf.name
687
  else:
688
  focus_plan_path = None
689
  else:
690
- # List of existing plans in CAPS
691
  plan_list = [f.replace(".pdf", "") for f in os.listdir("CAPS") if f.endswith('.pdf')]
692
- selected_plan = st.selectbox(
693
- "Select a plan:",
694
- plan_list,
695
- key="selected_plan_long_context"
696
- )
697
- focus_plan_path = os.path.join("CAPS", selected_plan)
698
- # Extract city name from the file name
699
- focus_city_name = os.path.splitext(selected_plan)[0].replace("_", " ")
700
 
701
  # List available summary documents for selection
702
  summaries_directory = "CAPS_Summaries"
703
- summary_files = [
704
- f.replace(".md", "").replace("_", " ")
705
- for f in os.listdir(summaries_directory) if f.endswith('.md')
706
- ]
707
- selected_summaries = st.multiselect(
708
- "Select summary documents for comparison:",
709
- summary_files,
710
- key="selected_summaries"
711
- )
712
 
713
- input_text = st.text_input(
714
- "Ask a comparison question:",
715
- key="comparison_input_long_context"
716
- )
717
 
718
  if st.button("Compare with Long Context", key="compare_button_long_context"):
719
  if not api_key:
@@ -724,20 +488,10 @@ with tab5:
724
  st.warning("Please enter a comparison question.")
725
  elif not focus_plan_path:
726
  st.warning("Please provide a focus plan.")
727
- elif not focus_city_name:
728
- st.warning("Please enter the city name for the focus plan.")
729
  else:
730
  display_placeholder = st.empty()
731
  with st.spinner("Processing..."):
732
  try:
733
- compare_plans_with_long_context_model(
734
- api_key,
735
- anthropic_api_key,
736
- input_text,
737
- focus_plan_path,
738
- focus_city_name,
739
- selected_summaries,
740
- display_placeholder
741
- )
742
  except Exception as e:
743
- st.error(f"An error occurred: {e}")
 
1
  import os
 
2
  import streamlit as st
3
  from tempfile import NamedTemporaryFile
 
 
 
4
  from langchain.chains import create_retrieval_chain
5
  from langchain.chains.combine_documents import create_stuff_documents_chain
6
  from langchain_core.prompts import ChatPromptTemplate
7
+ from langchain_openai import ChatOpenAI
8
+ from langchain_community.document_loaders import PyPDFLoader
9
+ from langchain_community.document_loaders import TextLoader
10
  from langchain_community.vectorstores import FAISS
11
+ from langchain_openai import OpenAIEmbeddings
12
  from langchain_text_splitters import RecursiveCharacterTextSplitter
13
+ import re
14
+ import anthropic
15
 
16
  # Function to remove code block markers from the answer
17
  def remove_code_blocks(text):
 
 
 
 
 
 
 
 
 
18
  code_block_pattern = r"^```(?:\w+)?\n(.*?)\n```$"
19
  match = re.match(code_block_pattern, text, re.DOTALL)
20
  if match:
 
23
  return text
24
 
25
  # Function to process PDF, run Q&A, and return results
26
+ def process_pdf(api_key, uploaded_file, questions_path, prompt_path, display_placeholder):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  os.environ["OPENAI_API_KEY"] = api_key
28
 
 
29
  with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
30
  temp_pdf.write(uploaded_file.read())
31
  temp_pdf_path = temp_pdf.name
32
 
 
33
  loader = PyPDFLoader(temp_pdf_path)
34
  docs = loader.load()
35
+
36
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
37
  splits = text_splitter.split_documents(docs)
38
 
 
39
  vectorstore = FAISS.from_documents(
40
+ documents=splits, embedding=OpenAIEmbeddings(model="text-embedding-3-large")
 
41
  )
42
  retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
43
 
 
44
  if os.path.exists(prompt_path):
45
  with open(prompt_path, "r") as file:
46
  system_prompt = file.read()
47
  else:
48
  raise FileNotFoundError(f"The specified file was not found: {prompt_path}")
49
 
 
50
  prompt = ChatPromptTemplate.from_messages(
51
  [
52
  ("system", system_prompt),
 
54
  ]
55
  )
56
 
 
57
  llm = ChatOpenAI(model="gpt-4o")
58
+ question_answer_chain = create_stuff_documents_chain(llm, prompt, document_variable_name="context")
 
 
 
 
59
  rag_chain = create_retrieval_chain(retriever, question_answer_chain)
60
 
 
61
  if os.path.exists(questions_path):
62
  with open(questions_path, "r") as file:
63
  questions = [line.strip() for line in file.readlines() if line.strip()]
64
  else:
65
  raise FileNotFoundError(f"The specified file was not found: {questions_path}")
66
 
 
67
  qa_results = []
68
  for question in questions:
69
  result = rag_chain.invoke({"input": question})
70
  answer = result["answer"]
71
 
 
72
  answer = remove_code_blocks(answer)
73
 
74
  qa_text = f"### Question: {question}\n**Answer:**\n{answer}\n"
75
  qa_results.append(qa_text)
76
  display_placeholder.markdown("\n".join(qa_results), unsafe_allow_html=True)
77
 
 
78
  os.remove(temp_pdf_path)
79
 
80
  return qa_results
81
 
82
+ # New function to process multi-plan QA using an existing vector store
83
+ def process_multi_plan_qa(api_key, input_text, display_placeholder):
 
 
 
 
 
 
 
 
 
84
  os.environ["OPENAI_API_KEY"] = api_key
85
 
86
  # Load the existing vector store
87
  embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
88
+ vector_store = FAISS.load_local("Combined_Summary_Vectorstore", embeddings, allow_dangerous_deserialization=True)
 
 
 
 
89
 
90
  # Convert the vector store to a retriever
91
  retriever = vector_store.as_retriever(search_kwargs={"k": 50})
 
108
 
109
  # Create the question-answering chain
110
  llm = ChatOpenAI(model="gpt-4o")
111
+ question_answer_chain = create_stuff_documents_chain(llm, prompt, document_variable_name="context")
 
 
112
  rag_chain = create_retrieval_chain(retriever, question_answer_chain)
113
 
114
  # Process the input text
 
118
  # Display the answer
119
  display_placeholder.markdown(f"**Answer:**\n{answer}")
120
 
121
+ def multi_plan_qa_multi_vectorstore(api_key, input_text, display_placeholder):
 
 
 
 
 
 
 
 
 
 
122
  os.environ["OPENAI_API_KEY"] = api_key
123
 
124
  # Directory containing individual vector stores
125
  vectorstore_directory = "Individual_Summary_Vectorstores"
126
 
127
  # List all vector store directories
128
+ vectorstore_names = [d for d in os.listdir(vectorstore_directory) if os.path.isdir(os.path.join(vectorstore_directory, d))]
 
 
 
129
 
130
  # Initialize a list to collect all retrieved chunks
131
  all_retrieved_chunks = []
 
136
 
137
  # Load the vector store
138
  embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
139
+ vector_store = FAISS.load_local(vectorstore_path, embeddings, allow_dangerous_deserialization=True)
 
 
 
 
140
 
141
  # Convert the vector store to a retriever
142
  retriever = vector_store.as_retriever(search_kwargs={"k": 2})
143
 
144
  # Retrieve relevant chunks for the input text
145
+ retrieved_chunks = retriever.invoke("input_text")
146
  all_retrieved_chunks.extend(retrieved_chunks)
147
 
148
  # Read the system prompt for multi-document QA
 
163
 
164
  # Create the question-answering chain
165
  llm = ChatOpenAI(model="gpt-4o")
166
+ question_answer_chain = create_stuff_documents_chain(llm, prompt, document_variable_name="context")
 
 
167
 
168
  # Process the combined context
169
+ result = question_answer_chain.invoke({"input": input_text, "context": all_retrieved_chunks})
 
 
 
170
 
171
  # Display the answer
172
+ display_placeholder.markdown(f"**Answer:**\n{result}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
 
 
174
 
175
def load_documents_from_pdf(file):
    """Load LangChain documents from an uploaded PDF file.

    Args:
        file: An uploaded file object exposing ``name`` and ``read()``
            (e.g. a Streamlit ``UploadedFile``).

    Returns:
        list: Documents parsed from the PDF, one per page.

    Raises:
        ValueError: If the uploaded file does not have a ``.pdf`` extension.
    """
    # Guard against non-PDF uploads before writing anything to disk.
    if not file.name.endswith('.pdf'):
        raise ValueError("The uploaded file is not a PDF. Please upload a PDF file.")

    # PyPDFLoader needs a real path, so spill the upload to a temp file.
    with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        temp_pdf.write(file.read())
        temp_pdf_path = temp_pdf.name

    try:
        loader = PyPDFLoader(temp_pdf_path)
        docs = loader.load()
    finally:
        # Always remove the temp file, even if PDF parsing raises
        # (the original leaked the file on a parse failure).
        os.remove(temp_pdf_path)
    return docs
188
 
189
def load_vector_store_from_path(path):
    """Load a persisted FAISS vector store from *path*.

    Uses the ``text-embedding-3-large`` OpenAI embedding model, which must
    match the model the store was built with.  Deserialization of local
    FAISS indexes is explicitly allowed, so only load trusted paths.
    """
    return FAISS.load_local(
        path,
        OpenAIEmbeddings(model="text-embedding-3-large"),
        allow_dangerous_deserialization=True,
    )
192
 
 
 
 
 
 
 
 
 
 
193
 
194
+ # Function to compare document via one-to-many query approach
195
+ def process_one_to_many_query(api_key, focus_input, comparison_inputs, input_text, display_placeholder):
196
+ os.environ["OPENAI_API_KEY"] = api_key
197
+ print(comparison_inputs)
198
  # Load focus documents or vector store
199
  if isinstance(focus_input, st.runtime.uploaded_file_manager.UploadedFile):
 
200
  focus_docs = load_documents_from_pdf(focus_input)
201
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
202
  focus_splits = text_splitter.split_documents(focus_docs)
203
+ focus_vector_store = FAISS.from_documents(focus_splits, OpenAIEmbeddings(model="text-embedding-3-large"))
 
 
 
204
  focus_retriever = focus_vector_store.as_retriever(search_kwargs={"k": 5})
205
  elif isinstance(focus_input, str) and os.path.isdir(focus_input):
 
206
  focus_vector_store = load_vector_store_from_path(focus_input)
207
  focus_retriever = focus_vector_store.as_retriever(search_kwargs={"k": 5})
208
  else:
209
  raise ValueError("Invalid focus input type. Must be a PDF file or a path to a vector store.")
210
 
 
211
  focus_docs = focus_retriever.invoke(input_text)
212
 
 
213
  comparison_chunks = []
214
  for comparison_input in comparison_inputs:
215
  if isinstance(comparison_input, st.runtime.uploaded_file_manager.UploadedFile):
 
216
  comparison_docs = load_documents_from_pdf(comparison_input)
217
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=500)
218
  comparison_splits = text_splitter.split_documents(comparison_docs)
219
+ comparison_vector_store = FAISS.from_documents(comparison_splits, OpenAIEmbeddings(model="text-embedding-3-large"))
 
 
 
220
  comparison_retriever = comparison_vector_store.as_retriever(search_kwargs={"k": 5})
221
  elif isinstance(comparison_input, str) and os.path.isdir(comparison_input):
 
222
  comparison_vector_store = load_vector_store_from_path(comparison_input)
223
  comparison_retriever = comparison_vector_store.as_retriever(search_kwargs={"k": 5})
224
  else:
225
  raise ValueError("Invalid comparison input type. Must be a PDF file or a path to a vector store.")
226
 
 
227
  comparison_docs = comparison_retriever.invoke(input_text)
228
  comparison_chunks.extend(comparison_docs)
229
 
230
  # Construct the combined context
231
+ combined_context = (
232
+ focus_docs +
233
+ comparison_chunks
234
+ )
235
 
236
  # Read the system prompt
237
  prompt_path = "Prompts/comparison_prompt.md"
 
252
  # Create the question-answering chain
253
  llm = ChatOpenAI(model="gpt-4o")
254
  question_answer_chain = create_stuff_documents_chain(
255
+ llm,
256
  prompt,
257
  document_variable_name="context"
258
  )
 
264
  })
265
 
266
  # Display the answer
267
+ display_placeholder.markdown(f"**Answer:**\n{result}")
 
268
 
269
def list_vector_store_documents():
    """Return the document names that have a saved vector store.

    Scans the ``Individual_All_Vectorstores`` directory for entries whose
    names end in ``_vectorstore`` and maps each entry back to a
    human-readable document name (the ``_vectorstore`` text is removed and
    remaining underscores become spaces).

    Returns:
        list[str]: Human-readable document names, in directory listing order.

    Raises:
        FileNotFoundError: If the vector store directory does not exist.
    """
    store_dir = "Individual_All_Vectorstores"
    if not os.path.exists(store_dir):
        raise FileNotFoundError(
            f"The directory '{store_dir}' does not exist. "
            "Run `create_and_save_individual_vector_stores()` to create it."
        )

    # Collect entries that follow the "<Doc_Name>_vectorstore" convention.
    names = []
    for entry in os.listdir(store_dir):
        if entry.endswith("_vectorstore"):
            names.append(entry.replace("_vectorstore", "").replace("_", " "))
    return names
278
 
279
+ def compare_with_long_context(api_key, anthropic_api_key, input_text, focus_plan_path, selected_summaries, display_placeholder):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  os.environ["OPENAI_API_KEY"] = api_key
281
  os.environ["ANTHROPIC_API_KEY"] = anthropic_api_key
 
282
  # Load the focus plan
283
+
284
+ # Load focus documents or vector store
285
+ if isinstance(focus_plan_path, st.runtime.uploaded_file_manager.UploadedFile):
286
+ focus_docs = load_documents_from_pdf(focus_plan_path)
287
+ elif isinstance(focus_plan_path, str):
288
  focus_loader = PyPDFLoader(focus_plan_path)
289
  focus_docs = focus_loader.load()
 
 
 
 
 
290
 
291
  # Concatenate selected summary documents
292
  summaries_directory = "CAPS_Summaries"
293
  summaries_content = ""
294
  for filename in selected_summaries:
295
+ with open(os.path.join(summaries_directory, f"{filename.replace(" Summary", "_Summary")}.md"), 'r') as file:
296
  summaries_content += file.read() + "\n\n"
297
 
298
  # Prepare the context
 
311
  # Display the answer
312
  display_placeholder.markdown(f"**Answer:**\n{message.content}", unsafe_allow_html=True)
313
 
314
+
315
  # Streamlit app layout with tabs
316
  st.title("Climate Policy Analysis Tool")
317
 
 
319
  api_key = st.text_input("Enter your OpenAI API key:", type="password", key="openai_key")
320
 
321
  # Create tabs
322
+ tab1, tab2, tab3, tab4, tab5 = st.tabs(["Summary Generation", "Multi-Plan QA (Shared Vectorstore)", "Multi-Plan QA (Multi-Vectorstore)", "Plan Comparison Tool", "Plan Comparison with Long Context Model"])
 
 
 
 
 
 
323
 
324
  # First tab: Summary Generation
325
  with tab1:
326
+ uploaded_file = st.file_uploader("Upload a Climate Action Plan in PDF format", type="pdf", key="upload_file")
 
 
 
 
327
 
328
  prompt_file_path = "Prompts/summary_tool_system_prompt.md"
329
  questions_file_path = "Prompts/summary_tool_questions.md"
 
337
  display_placeholder = st.empty()
338
  with st.spinner("Processing..."):
339
  try:
340
+ results = process_pdf(api_key, uploaded_file, questions_file_path, prompt_file_path, display_placeholder)
341
+
 
 
 
 
 
342
  markdown_text = "\n".join(results)
343
+
344
  # Use the uploaded file's name for the download file
345
  base_name = os.path.splitext(uploaded_file.name)[0]
346
  download_file_name = f"{base_name}_Summary.md"
347
+
348
  st.download_button(
349
  label="Download Results as Markdown",
350
  data=markdown_text,
 
355
  except Exception as e:
356
  st.error(f"An error occurred: {e}")
357
 
358
+ # Second tab: Multi-Plan QA
359
  with tab2:
360
  input_text = st.text_input("Ask a question:", key="multi_plan_input")
361
  if st.button("Ask", key="multi_plan_qa_button"):
 
367
  display_placeholder2 = st.empty()
368
  with st.spinner("Processing..."):
369
  try:
370
+ process_multi_plan_qa(
371
  api_key,
372
  input_text,
373
  display_placeholder2
 
375
  except Exception as e:
376
  st.error(f"An error occurred: {e}")
377
 
378
+
379
  with tab3:
380
+ user_input = st.text_input("Ask a question:", key="multi_vectorstore_input")
381
  if st.button("Ask", key="multi_vectorstore_qa_button"):
382
  if not api_key:
383
  st.warning("Please provide your OpenAI API key.")
 
387
  display_placeholder3 = st.empty()
388
  with st.spinner("Processing..."):
389
  try:
390
+ multi_plan_qa_multi_vectorstore(
391
  api_key,
392
  user_input,
393
  display_placeholder3
 
403
  vectorstore_documents = list_vector_store_documents()
404
 
405
  # Option to upload a new plan or select from existing vector stores
406
+ focus_option = st.radio("Choose a focus plan:", ("Select from existing vector stores", "Upload a new plan"), key="focus_option")
 
 
 
 
407
 
408
  if focus_option == "Upload a new plan":
409
+ focus_uploaded_file = st.file_uploader("Upload a Climate Action Plan to compare", type="pdf", key="focus_upload")
410
+ if focus_uploaded_file is not None:
 
 
 
 
 
 
 
 
411
  # Directly use the uploaded file
412
  focus_input = focus_uploaded_file
413
  else:
414
  focus_input = None
415
  else:
416
  # Select a focus plan from existing vector stores
417
+ selected_focus_plan = st.selectbox("Select a focus plan:", vectorstore_documents, key="select_focus_plan")
418
+ focus_input = os.path.join("Individual_All_Vectorstores", f"{selected_focus_plan.replace(" Summary", "_Summary")}_vectorstore")
 
 
 
 
 
 
 
 
419
 
420
  # Option to upload comparison documents or select from existing vector stores
421
+ comparison_option = st.radio("Choose comparison documents:", ("Select from existing vector stores", "Upload new documents"), key="comparison_option")
 
 
 
 
422
 
423
  if comparison_option == "Upload new documents":
424
+ comparison_files = st.file_uploader("Upload comparison documents", type="pdf", accept_multiple_files=True, key="comparison_files")
 
 
 
 
 
425
  comparison_inputs = comparison_files
426
  else:
427
  # Select comparison documents from existing vector stores
428
+ selected_comparison_plans = st.multiselect("Select comparison documents:", vectorstore_documents, key="select_comparison_plans")
429
+ comparison_inputs = [os.path.join("Individual_All_Vectorstores", f"{doc.replace(" Summary", "_Summary")}_vectorstore") for doc in selected_comparison_plans]
 
 
 
 
 
 
 
 
 
430
 
431
+ input_text = st.text_input("Ask a comparison question:", key="comparison_input")
 
 
 
432
 
433
  if st.button("Compare", key="compare_button"):
434
  if not api_key:
 
443
  display_placeholder4 = st.empty()
444
  with st.spinner("Processing..."):
445
  try:
446
+ # Call the process_one_to_many_query function
447
+ process_one_to_many_query(api_key, focus_input, comparison_inputs, input_text, display_placeholder4)
448
+
 
 
 
 
449
  except Exception as e:
450
  st.error(f"An error occurred: {e}")
451
 
 
454
  st.header("Plan Comparison with Long Context Model")
455
 
456
  # Anthropics API Key Input
457
+ anthropic_api_key = st.text_input("Enter your Anthropic API key:", type="password", key="anthropic_key")
 
 
 
 
458
 
459
  # Option to upload a new plan or select from a list
460
+ focus_option = st.radio("Choose a focus plan:", ("Select from existing plans", "Upload a new plan"), key="focus_option_long_context")
 
 
 
 
461
 
462
+ if focus_option == "Upload a new plan":
463
+ focus_uploaded_file = st.file_uploader("Upload a Climate Action Plan to compare", type="pdf", key="focus_upload_long_context")
464
+ if focus_uploaded_file is not None:
465
+ # Directly use the uploaded file
466
+ focus_plan_path = focus_uploaded_file
 
 
 
 
 
 
 
 
 
 
467
  else:
468
  focus_plan_path = None
469
  else:
470
+ # Select a focus plan from existing vector stores
471
  plan_list = [f.replace(".pdf", "") for f in os.listdir("CAPS") if f.endswith('.pdf')]
472
+ selected_focus_plan = st.selectbox("Select a focus plan:", plan_list, key="select_focus_plan_long_context")
473
+ focus_plan_path = os.path.join("CAPS", f"{selected_focus_plan}.pdf")
 
 
 
 
 
 
474
 
475
  # List available summary documents for selection
476
  summaries_directory = "CAPS_Summaries"
477
+ summary_files = [f.replace(".md", "").replace("_", " ") for f in os.listdir(summaries_directory) if f.endswith('.md')]
478
+ selected_summaries = st.multiselect("Select summary documents for comparison:", summary_files, key="selected_summaries")
 
 
 
 
 
 
 
479
 
480
+ input_text = st.text_input("Ask a comparison question:", key="comparison_input_long_context")
 
 
 
481
 
482
  if st.button("Compare with Long Context", key="compare_button_long_context"):
483
  if not api_key:
 
488
  st.warning("Please enter a comparison question.")
489
  elif not focus_plan_path:
490
  st.warning("Please provide a focus plan.")
 
 
491
  else:
492
  display_placeholder = st.empty()
493
  with st.spinner("Processing..."):
494
  try:
495
+ compare_with_long_context(api_key, anthropic_api_key, input_text, focus_plan_path, selected_summaries, display_placeholder)
 
 
 
 
 
 
 
 
496
  except Exception as e:
497
+ st.error(f"An error occurred: {e}")