umangchaudhry committed on
Commit
949c8ba
·
verified ·
1 Parent(s): 52e78c0

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +251 -23
  2. batch_summary_generation.py +100 -0
app.py CHANGED
@@ -1,16 +1,17 @@
1
  import os
2
  import streamlit as st
3
- from io import BytesIO
4
  from tempfile import NamedTemporaryFile
5
  from langchain.chains import create_retrieval_chain
6
  from langchain.chains.combine_documents import create_stuff_documents_chain
7
  from langchain_core.prompts import ChatPromptTemplate
8
  from langchain_openai import ChatOpenAI
9
  from langchain_community.document_loaders import PyPDFLoader
 
10
  from langchain_community.vectorstores import FAISS
11
  from langchain_openai import OpenAIEmbeddings
12
  from langchain_text_splitters import RecursiveCharacterTextSplitter
13
  import re
 
14
 
15
  # Function to remove code block markers from the answer
16
  def remove_code_blocks(text):
@@ -84,13 +85,13 @@ def process_multi_plan_qa(api_key, input_text, display_placeholder):
84
 
85
  # Load the existing vector store
86
  embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
87
- vector_store = FAISS.load_local("multi_plan_vectorstore", embeddings, allow_dangerous_deserialization=True)
88
 
89
  # Convert the vector store to a retriever
90
- retriever = vector_store.as_retriever(search_kwargs={"k": 10})
91
 
92
  # Read the system prompt for multi-document QA
93
- prompt_path = "multi_document_qa_system_prompt.md"
94
  if os.path.exists(prompt_path):
95
  with open(prompt_path, "r") as file:
96
  system_prompt = file.read()
@@ -117,12 +118,11 @@ def process_multi_plan_qa(api_key, input_text, display_placeholder):
117
  # Display the answer
118
  display_placeholder.markdown(f"**Answer:**\n{answer}")
119
 
120
-
121
  def multi_plan_qa_multi_vectorstore(api_key, input_text, display_placeholder):
122
  os.environ["OPENAI_API_KEY"] = api_key
123
 
124
  # Directory containing individual vector stores
125
- vectorstore_directory = "Individual_Vectorstores"
126
 
127
  # List all vector store directories
128
  vectorstore_names = [d for d in os.listdir(vectorstore_directory) if os.path.isdir(os.path.join(vectorstore_directory, d))]
@@ -139,15 +139,14 @@ def multi_plan_qa_multi_vectorstore(api_key, input_text, display_placeholder):
139
  vector_store = FAISS.load_local(vectorstore_path, embeddings, allow_dangerous_deserialization=True)
140
 
141
  # Convert the vector store to a retriever
142
- retriever = vector_store.as_retriever(search_kwargs={"k": 4})
143
 
144
  # Retrieve relevant chunks for the input text
145
  retrieved_chunks = retriever.invoke("input_text")
146
- print(retrieved_chunks)
147
  all_retrieved_chunks.extend(retrieved_chunks)
148
 
149
  # Read the system prompt for multi-document QA
150
- prompt_path = "multi_document_qa_system_prompt.md"
151
  if os.path.exists(prompt_path):
152
  with open(prompt_path, "r") as file:
153
  system_prompt = file.read()
@@ -173,23 +172,161 @@ def multi_plan_qa_multi_vectorstore(api_key, input_text, display_placeholder):
173
  display_placeholder.markdown(f"**Answer:**\n{result}")
174
 
175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  # Streamlit app layout with tabs
177
  st.title("Climate Policy Analysis Tool")
178
 
179
  # API Key Input
180
- api_key = st.text_input("Enter your OpenAI API key:", type="password")
181
 
182
  # Create tabs
183
- tab1, tab2, tab3 = st.tabs(["Summary Generation", "Multi-Plan QA (Shared Vectorstore)", "Multi-Plan QA (Multi-Vectorstore)"])
184
 
185
  # First tab: Summary Generation
186
  with tab1:
187
- uploaded_file = st.file_uploader("Upload a Climate Action Plan in PDF format", type="pdf")
188
 
189
- prompt_file_path = "summary_tool_system_prompt.md"
190
- questions_file_path = "summary_tool_questions.md"
191
 
192
- if st.button("Generate") and api_key and uploaded_file:
193
  display_placeholder = st.empty()
194
 
195
  with st.spinner("Processing..."):
@@ -200,26 +337,117 @@ with tab1:
200
 
201
  # Use the uploaded file's name for the download file
202
  base_name = os.path.splitext(uploaded_file.name)[0]
203
- download_file_name = f"{base_name}_summary.md"
204
 
205
  st.download_button(
206
  label="Download Results as Markdown",
207
  data=markdown_text,
208
  file_name=download_file_name,
209
- mime="text/markdown"
 
210
  )
211
  except Exception as e:
212
  st.error(f"An error occurred: {e}")
213
 
214
  # Second tab: Multi-Plan QA
215
  with tab2:
216
- input_text = st.text_input("Ask a question:")
217
  if input_text and api_key:
218
- display_placeholder = st.empty()
219
- process_multi_plan_qa(api_key, input_text, display_placeholder)
220
 
221
  with tab3:
222
- user_input = st.text_input("Ask a Question")
223
  if user_input and api_key:
224
- display_placeholder2 = st.empty()
225
- multi_plan_qa_multi_vectorstore(api_key, user_input, display_placeholder2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import streamlit as st
 
3
  from tempfile import NamedTemporaryFile
4
  from langchain.chains import create_retrieval_chain
5
  from langchain.chains.combine_documents import create_stuff_documents_chain
6
  from langchain_core.prompts import ChatPromptTemplate
7
  from langchain_openai import ChatOpenAI
8
  from langchain_community.document_loaders import PyPDFLoader
9
+ from langchain_community.document_loaders import TextLoader
10
  from langchain_community.vectorstores import FAISS
11
  from langchain_openai import OpenAIEmbeddings
12
  from langchain_text_splitters import RecursiveCharacterTextSplitter
13
  import re
14
+ import anthropic
15
 
16
  # Function to remove code block markers from the answer
17
  def remove_code_blocks(text):
 
85
 
86
  # Load the existing vector store
87
  embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
88
+ vector_store = FAISS.load_local("Combined_Summary_Vectorstore", embeddings, allow_dangerous_deserialization=True)
89
 
90
  # Convert the vector store to a retriever
91
+ retriever = vector_store.as_retriever(search_kwargs={"k": 50})
92
 
93
  # Read the system prompt for multi-document QA
94
+ prompt_path = "Prompts/multi_document_qa_system_prompt.md"
95
  if os.path.exists(prompt_path):
96
  with open(prompt_path, "r") as file:
97
  system_prompt = file.read()
 
118
  # Display the answer
119
  display_placeholder.markdown(f"**Answer:**\n{answer}")
120
 
 
121
  def multi_plan_qa_multi_vectorstore(api_key, input_text, display_placeholder):
122
  os.environ["OPENAI_API_KEY"] = api_key
123
 
124
  # Directory containing individual vector stores
125
+ vectorstore_directory = "Individual_Summary_Vectorstores"
126
 
127
  # List all vector store directories
128
  vectorstore_names = [d for d in os.listdir(vectorstore_directory) if os.path.isdir(os.path.join(vectorstore_directory, d))]
 
139
  vector_store = FAISS.load_local(vectorstore_path, embeddings, allow_dangerous_deserialization=True)
140
 
141
  # Convert the vector store to a retriever
142
+ retriever = vector_store.as_retriever(search_kwargs={"k": 2})
143
 
144
  # Retrieve relevant chunks for the input text
145
  retrieved_chunks = retriever.invoke("input_text")
 
146
  all_retrieved_chunks.extend(retrieved_chunks)
147
 
148
  # Read the system prompt for multi-document QA
149
+ prompt_path = "Prompts/multi_document_qa_system_prompt.md"
150
  if os.path.exists(prompt_path):
151
  with open(prompt_path, "r") as file:
152
  system_prompt = file.read()
 
172
  display_placeholder.markdown(f"**Answer:**\n{result}")
173
 
174
 
175
+ # Function to compare document via one-to-many query approach
176
def process_one_to_many_query(api_key, focus_input, comparison_inputs, input_text, display_placeholder):
    """Answer a question that compares one focus plan against several other plans.

    For the focus plan and for every comparison plan, the five chunks most
    similar to ``input_text`` are retrieved. All retrieved chunks (focus first,
    then comparisons in the order given) are stuffed into a single GPT-4o call
    whose system prompt is read from ``Prompts/comparison_prompt.md``.

    Args:
        api_key: OpenAI API key; exported as ``OPENAI_API_KEY``.
        focus_input: Either a Streamlit ``UploadedFile`` holding a PDF, or the
            path (``str``) of a directory containing a saved FAISS store.
        comparison_inputs: Iterable of inputs of the same two kinds.
        input_text: The user's comparison question.
        display_placeholder: Object whose ``markdown`` method receives the answer.

    Raises:
        ValueError: If an input is neither an uploaded file nor an existing directory.
        FileNotFoundError: If the comparison prompt file is missing.
    """
    os.environ["OPENAI_API_KEY"] = api_key

    def load_documents_from_pdf(file):
        # PyPDFLoader is given a filesystem path, so the upload is spilled to a temp file.
        # Rewind first: the same upload object may already have been read.
        file.seek(0)
        with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
            temp_pdf.write(file.read())
            temp_pdf_path = temp_pdf.name
        try:
            return PyPDFLoader(temp_pdf_path).load()
        finally:
            # Remove the temp file even when parsing fails, so failures do not leak files.
            os.remove(temp_pdf_path)

    def load_vector_store_from_path(path):
        embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
        # NOTE(review): allow_dangerous_deserialization unpickles the stored index;
        # only point this at vector stores produced by a trusted process.
        return FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)

    def build_retriever(source, chunk_size, role):
        # Shared by the focus and comparison paths; only chunk_size and the
        # wording of the error message differ between them.
        if isinstance(source, st.runtime.uploaded_file_manager.UploadedFile):
            pages = load_documents_from_pdf(source)
            splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=500)
            splits = splitter.split_documents(pages)
            vector_store = FAISS.from_documents(splits, OpenAIEmbeddings(model="text-embedding-3-large"))
        elif isinstance(source, str) and os.path.isdir(source):
            vector_store = load_vector_store_from_path(source)
        else:
            raise ValueError(f"Invalid {role} input type. Must be a PDF file or a path to a vector store.")
        return vector_store.as_retriever(search_kwargs={"k": 5})

    # Focus plan: uploaded PDFs are split into 3000-character chunks.
    focus_docs = build_retriever(focus_input, 3000, "focus").invoke(input_text)

    # Comparison plans: uploaded PDFs are split into 1000-character chunks.
    comparison_chunks = []
    for comparison_input in comparison_inputs:
        comparison_retriever = build_retriever(comparison_input, 1000, "comparison")
        comparison_chunks.extend(comparison_retriever.invoke(input_text))

    # Construct the combined context (focus chunks first)
    combined_context = focus_docs + comparison_chunks

    # Read the system prompt
    prompt_path = "Prompts/comparison_prompt.md"
    if os.path.exists(prompt_path):
        with open(prompt_path, "r") as file:
            system_prompt = file.read()
    else:
        raise FileNotFoundError(f"The specified file was not found: {prompt_path}")

    # Create the prompt template
    # NOTE(review): the chain injects documents under "context", so the prompt
    # file presumably contains a {context} placeholder - confirm.
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "{input}"),
        ]
    )

    # Create the question-answering chain
    llm = ChatOpenAI(model="gpt-4o")
    question_answer_chain = create_stuff_documents_chain(
        llm,
        prompt,
        document_variable_name="context",
    )

    # Process the combined context
    result = question_answer_chain.invoke({
        "context": combined_context,
        "input": input_text,
    })

    # Display the answer
    display_placeholder.markdown(f"**Answer:**\n{result}")
263
+
264
+ # Function to list vector store documents
265
def list_vector_store_documents():
    """Return display names for the stores in ``Individual_All_Vectorstores``.

    Every directory entry whose name ends with ``_vectorstore`` is reported with
    that trailing suffix removed and underscores shown as spaces. Only the
    trailing suffix is stripped, so a name that contains ``_vectorstore``
    elsewhere keeps that part. Names are returned sorted so the UI order does
    not depend on the platform's directory listing order.

    Returns:
        list[str]: Sorted display names.

    Raises:
        FileNotFoundError: If the ``Individual_All_Vectorstores`` directory does not exist.
    """
    # Assuming documents are stored in the "Individual_All_Vectorstores" directory
    directory_path = "Individual_All_Vectorstores"
    suffix = "_vectorstore"
    if not os.path.exists(directory_path):
        raise FileNotFoundError(f"The directory '{directory_path}' does not exist. Run `create_and_save_individual_vector_stores()` to create it.")
    # List all available vector stores by document name
    return sorted(
        entry[: -len(suffix)].replace("_", " ")
        for entry in os.listdir(directory_path)
        if entry.endswith(suffix)
    )
273
+
274
def compare_with_long_context(api_key, anthropic_api_key, input_text, focus_plan_path, focus_city_name, selected_summaries, display_placeholder):
    """Ask Claude a comparison question over a full focus plan plus selected summaries.

    The whole focus document and the full text of every selected summary are
    placed in a single user message (no retrieval step), and the model's text
    reply is written to ``display_placeholder``.

    Args:
        api_key: OpenAI API key; exported as ``OPENAI_API_KEY``.
        anthropic_api_key: Anthropic API key; exported and passed to the client.
        input_text: The user's comparison question.
        focus_plan_path: Path to the focus plan (``.pdf`` or ``.md``). A path
            given without an extension is accepted when ``<path>.pdf`` exists.
        focus_city_name: Not used by this function; kept for caller compatibility.
        selected_summaries: Names of summaries inside ``CAPS_Summaries``. Each may
            be a real filename or a display label (``.md`` removed, underscores
            shown as spaces).
        display_placeholder: Object whose ``markdown`` method receives the answer.

    Raises:
        ValueError: If the focus plan is neither a PDF nor a Markdown file.
        FileNotFoundError: If a selected summary cannot be found.
    """
    os.environ["OPENAI_API_KEY"] = api_key
    os.environ["ANTHROPIC_API_KEY"] = anthropic_api_key

    def resolve_summary_path(directory, name):
        # Prefer the literal filename; otherwise treat `name` as a display label
        # and look for the .md file whose label matches it.
        direct = os.path.join(directory, name)
        if os.path.isfile(direct):
            return direct
        wanted = name.replace("_", " ")
        for candidate in sorted(os.listdir(directory)):
            if not candidate.endswith(".md"):
                continue
            if candidate.replace(".md", "").replace("_", " ") == wanted:
                return os.path.join(directory, candidate)
        raise FileNotFoundError(f"No summary file matching '{name}' was found in '{directory}'.")

    # Load the focus plan
    focus_path = focus_plan_path
    if not focus_path.lower().endswith((".pdf", ".md")) and os.path.isfile(focus_path + ".pdf"):
        # The plan picker strips ".pdf" from the names it offers; restore it.
        focus_path = focus_path + ".pdf"

    lowered_path = focus_path.lower()
    if lowered_path.endswith(".pdf"):
        focus_docs = PyPDFLoader(focus_path).load()
    elif lowered_path.endswith(".md"):
        focus_docs = TextLoader(focus_path).load()
    else:
        raise ValueError("Unsupported file format for focus plan.")

    # Concatenate selected summary documents (each followed by a blank line)
    summaries_directory = "CAPS_Summaries"
    summary_texts = []
    for filename in selected_summaries:
        with open(resolve_summary_path(summaries_directory, filename), 'r') as file:
            summary_texts.append(file.read() + "\n\n")
    summaries_content = "".join(summary_texts)

    # Prepare the context
    focus_context = "\n\n".join(doc.page_content for doc in focus_docs)

    # Create the client and message
    client = anthropic.Anthropic(api_key=anthropic_api_key)
    message = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=1024,
        messages=[
            {"role": "user", "content": f"{input_text}\n\nFocus Document:\n{focus_context}\n\nSummaries:\n{summaries_content}"}
        ]
    )

    # `message.content` is a list of content blocks, not a string; formatting it
    # directly would show the list's repr, so join the text blocks instead.
    answer = "".join(
        block.text for block in message.content if getattr(block, "type", None) == "text"
    )

    # Display the answer
    # NOTE(review): unsafe_allow_html renders model output as raw HTML; the output
    # is influenced by uploaded documents, so consider disabling this.
    display_placeholder.markdown(f"**Answer:**\n{answer}", unsafe_allow_html=True)
311
+
312
+
313
  # Streamlit app layout with tabs
314
  st.title("Climate Policy Analysis Tool")
315
 
316
  # API Key Input
317
+ api_key = st.text_input("Enter your OpenAI API key:", type="password", key="openai_key")
318
 
319
  # Create tabs
320
+ tab1, tab2, tab3, tab4, tab5 = st.tabs(["Summary Generation", "Multi-Plan QA (Shared Vectorstore)", "Multi-Plan QA (Multi-Vectorstore)", "Plan Comparison Tool", "Plan Comparison with Long Context Model"])
321
 
322
  # First tab: Summary Generation
323
  with tab1:
324
+ uploaded_file = st.file_uploader("Upload a Climate Action Plan in PDF format", type="pdf", key="upload_file")
325
 
326
+ prompt_file_path = "Prompts/summary_tool_system_prompt.md"
327
+ questions_file_path = "Prompts/summary_tool_questions.md"
328
 
329
+ if st.button("Generate", key="generate_button") and api_key and uploaded_file:
330
  display_placeholder = st.empty()
331
 
332
  with st.spinner("Processing..."):
 
337
 
338
  # Use the uploaded file's name for the download file
339
  base_name = os.path.splitext(uploaded_file.name)[0]
340
+ download_file_name = f"{base_name}_Summary.md"
341
 
342
  st.download_button(
343
  label="Download Results as Markdown",
344
  data=markdown_text,
345
  file_name=download_file_name,
346
+ mime="text/markdown",
347
+ key="download_button"
348
  )
349
  except Exception as e:
350
  st.error(f"An error occurred: {e}")
351
 
352
  # Second tab: Multi-Plan QA
353
  with tab2:
354
+ input_text = st.text_input("Ask a question:", key="multi_plan_input")
355
  if input_text and api_key:
356
+ display_placeholder2 = st.empty()
357
+ process_multi_plan_qa(api_key, input_text, display_placeholder2)
358
 
359
  with tab3:
360
+ user_input = st.text_input("Ask a Question", key="multi_vectorstore_input")
361
  if user_input and api_key:
362
+ display_placeholder3 = st.empty()
363
+ multi_plan_qa_multi_vectorstore(api_key, user_input, display_placeholder3)
364
+
365
+ # Fourth tab: Plan Comparison Tool
366
with tab4:
    st.header("Plan Comparison Tool")

    vectorstore_root = "Individual_All_Vectorstores"

    def resolve_vectorstore_path(document_name):
        """Map a listed document name back to its vector store directory.

        Listed names show underscores as spaces, so the directory cannot always
        be rebuilt by appending the suffix. Try the literal name first, then
        look for the directory whose label matches. Falls back to the literal
        path so the downstream validation reports the problem.
        """
        suffix = "_vectorstore"
        literal = os.path.join(vectorstore_root, f"{document_name}{suffix}")
        if os.path.isdir(literal):
            return literal
        wanted = document_name.replace("_", " ")
        for entry in sorted(os.listdir(vectorstore_root)):
            if entry.endswith(suffix) and entry[: -len(suffix)].replace("_", " ") == wanted:
                return os.path.join(vectorstore_root, entry)
        return literal

    # List of documents from vector stores. A missing directory is reported in
    # this tab instead of aborting the whole script (and the tabs after it).
    try:
        vectorstore_documents = list_vector_store_documents()
    except FileNotFoundError as e:
        st.error(str(e))
        vectorstore_documents = []

    # Option to upload a new plan or select from existing vector stores
    focus_option = st.radio("Choose a focus plan:", ("Select from existing vector stores", "Upload a new plan"), key="focus_option")

    if focus_option == "Upload a new plan":
        focus_uploaded_file = st.file_uploader("Upload a Climate Action Plan to compare", type="pdf", key="focus_upload")
        focus_city_name = st.text_input("Enter the city name for the uploaded plan:", key="focus_city_name")
        if focus_uploaded_file is not None and focus_city_name:
            # Directly use the uploaded file
            focus_input = focus_uploaded_file
        else:
            focus_input = None
    else:
        # Select a focus plan from existing vector stores
        selected_focus_plan = st.selectbox("Select a focus plan:", vectorstore_documents, key="select_focus_plan")
        if selected_focus_plan is None:
            # selectbox returns None when there is nothing to choose from.
            focus_input = None
            focus_city_name = ""
        else:
            focus_input = resolve_vectorstore_path(selected_focus_plan)
            focus_city_name = selected_focus_plan.replace("_", " ")

    # Option to upload comparison documents or select from existing vector stores
    comparison_option = st.radio("Choose comparison documents:", ("Select from existing vector stores", "Upload new documents"), key="comparison_option")

    if comparison_option == "Upload new documents":
        comparison_files = st.file_uploader("Upload comparison documents", type="pdf", accept_multiple_files=True, key="comparison_files")
        comparison_inputs = comparison_files
    else:
        # Select comparison documents from existing vector stores
        selected_comparison_plans = st.multiselect("Select comparison documents:", vectorstore_documents, key="select_comparison_plans")
        comparison_inputs = [resolve_vectorstore_path(doc) for doc in selected_comparison_plans]

    input_text = st.text_input("Ask a comparison question:", key="comparison_input")

    if st.button("Compare", key="compare_button") and api_key and input_text and focus_input and comparison_inputs:
        display_placeholder4 = st.empty()
        with st.spinner("Processing..."):
            try:
                # Call the process_one_to_many_query function
                process_one_to_many_query(api_key, focus_input, comparison_inputs, input_text, display_placeholder4)

            except Exception as e:
                st.error(f"An error occurred: {e}")
411
+
412
+ # Fifth tab: Plan Comparison with Long Context Model
413
with tab5:
    st.header("Plan Comparison with Long Context Model")

    # Anthropic API key input
    anthropic_api_key = st.text_input("Enter your Anthropic API key:", type="password", key="anthropic_key")

    # Option to upload a new plan or select from a list
    upload_option = st.radio("Choose a focus plan:", ("Select from existing plans", "Upload a new plan"), key="upload_option_long_context")

    if upload_option == "Upload a new plan":
        focus_uploaded_file = st.file_uploader("Upload a Climate Action Plan to compare", type="pdf", key="focus_upload_long_context")
        focus_city_name = st.text_input("Enter the city name for the uploaded plan:", key="focus_city_name_long_context")
        # The upload is written to disk only when a comparison actually runs;
        # writing it here would create a new, never-deleted temp file on every rerun.
        focus_plan_path = None
    else:
        focus_uploaded_file = None
        # List of existing plans in CAPS (shown without the ".pdf" extension)
        caps_directory = "CAPS"
        caps_entries = os.listdir(caps_directory) if os.path.isdir(caps_directory) else []
        plan_list = [f[: -len(".pdf")] for f in caps_entries if f.endswith('.pdf')]
        selected_plan = st.selectbox("Select a plan:", plan_list, key="selected_plan_long_context")
        if selected_plan is None:
            # selectbox returns None when there are no plans to choose from.
            focus_plan_path = None
            focus_city_name = ""
        else:
            # Restore the extension that was stripped for display, so the loader
            # can recognise the file type.
            focus_plan_path = os.path.join(caps_directory, f"{selected_plan}.pdf")
            # Derive the city name from the file name
            focus_city_name = selected_plan.replace("_", " ")

    # List available summary documents for selection. Labels are shown to the
    # user; the real filenames are what gets opened later.
    summaries_directory = "CAPS_Summaries"
    summary_entries = os.listdir(summaries_directory) if os.path.isdir(summaries_directory) else []
    summary_files_by_label = {
        f.replace(".md", "").replace("_", " "): f
        for f in summary_entries
        if f.endswith('.md')
    }
    selected_summary_labels = st.multiselect("Select summary documents for comparison:", list(summary_files_by_label), key="selected_summaries")
    selected_summaries = [summary_files_by_label[label] for label in selected_summary_labels]

    input_text = st.text_input("Ask a comparison question:", key="comparison_input_long_context")

    focus_ready = bool(focus_city_name) and (focus_uploaded_file is not None or focus_plan_path is not None)

    if st.button("Compare with Long Context", key="compare_button_long_context") and api_key and anthropic_api_key and input_text and focus_ready:
        display_placeholder = st.empty()
        with st.spinner("Processing..."):
            temp_pdf_path = None
            try:
                if focus_uploaded_file is not None:
                    # Save uploaded file temporarily; getvalue() returns the whole
                    # upload regardless of the current read position.
                    with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
                        temp_pdf.write(focus_uploaded_file.getvalue())
                        temp_pdf_path = temp_pdf.name
                    focus_plan_path = temp_pdf_path
                compare_with_long_context(api_key, anthropic_api_key, input_text, focus_plan_path, focus_city_name, selected_summaries, display_placeholder)
            except Exception as e:
                st.error(f"An error occurred: {e}")
            finally:
                # Always remove the temporary copy of the upload.
                if temp_pdf_path is not None and os.path.exists(temp_pdf_path):
                    os.remove(temp_pdf_path)
batch_summary_generation.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from tempfile import NamedTemporaryFile
3
+ from langchain.chains import create_retrieval_chain
4
+ from langchain.chains.combine_documents import create_stuff_documents_chain
5
+ from langchain_core.prompts import ChatPromptTemplate
6
+ from langchain_openai import ChatOpenAI
7
+ from langchain_community.document_loaders import PyPDFLoader
8
+ from langchain_community.vectorstores import FAISS
9
+ from langchain_openai import OpenAIEmbeddings
10
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
11
+
12
def process_pdf(api_key, pdf_path, questions_path, prompt_path):
    """Answer a fixed list of questions about one PDF using retrieval-augmented GPT-4o.

    The PDF is split into 3000-character chunks (500 overlap), embedded into an
    in-memory FAISS index, and each non-blank line of ``questions_path`` is asked
    against the ten most similar chunks.

    The prompt and question files are read before any embedding call is made, so
    a missing file fails immediately instead of after paid API requests. The PDF
    is loaded directly from ``pdf_path``; no temporary copy is created, so there
    is nothing to clean up when a later step fails.

    Args:
        api_key: OpenAI API key; exported as ``OPENAI_API_KEY``.
        pdf_path: Path to the PDF plan on disk.
        questions_path: Text file with one question per line.
        prompt_path: File containing the system prompt.

    Returns:
        list[str]: One Markdown snippet per question, in file order.

    Raises:
        FileNotFoundError: If the PDF, prompt file or questions file is missing.
    """
    os.environ["OPENAI_API_KEY"] = api_key

    if not os.path.isfile(pdf_path):
        raise FileNotFoundError(f"The specified file was not found: {pdf_path}")

    if os.path.exists(prompt_path):
        with open(prompt_path, "r") as file:
            system_prompt = file.read()
    else:
        raise FileNotFoundError(f"The specified file was not found: {prompt_path}")

    if os.path.exists(questions_path):
        with open(questions_path, "r") as file:
            questions = [line.strip() for line in file if line.strip()]
    else:
        raise FileNotFoundError(f"The specified file was not found: {questions_path}")

    docs = PyPDFLoader(pdf_path).load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
    splits = text_splitter.split_documents(docs)

    vectorstore = FAISS.from_documents(
        documents=splits, embedding=OpenAIEmbeddings(model="text-embedding-3-large")
    )
    retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "{input}"),
        ]
    )

    llm = ChatOpenAI(model="gpt-4o")
    question_answer_chain = create_stuff_documents_chain(llm, prompt, document_variable_name="context")
    rag_chain = create_retrieval_chain(retriever, question_answer_chain)

    qa_results = []
    for question in questions:
        result = rag_chain.invoke({"input": question})
        answer = result["answer"]
        qa_results.append(f"### Question: {question}\n**Answer:**\n{answer}\n")

    return qa_results
65
+
66
def main():
    """Summarise every PDF in a user-supplied folder into ``CAPS_Summaries``.

    Prompts for the folder and the OpenAI API key, runs ``process_pdf`` on each
    PDF (in sorted order), and writes ``<name>_Summary.md`` per plan. A failure
    on one PDF is reported and does not stop the remaining files.
    """
    # Local import keeps the new dependency inside this entry point.
    from getpass import getpass

    # Get user input for directory path and API key
    directory_path = input("Enter the path to the folder containing the PDF plans: ").strip()
    # Read the key without echoing it to the terminal or scrollback.
    api_key = getpass("Enter your OpenAI API key: ").strip()

    if not os.path.isdir(directory_path):
        print(f"The folder '{directory_path}' does not exist or is not a directory.")
        return

    # Paths for prompt and questions files
    # NOTE(review): app.py reads these from the "Prompts/" directory; confirm
    # that copies also exist next to this script.
    prompt_file_path = "summary_tool_system_prompt.md"
    questions_file_path = "summary_tool_questions.md"

    # Create output directory if it doesn't exist
    output_directory = "CAPS_Summaries"
    os.makedirs(output_directory, exist_ok=True)

    # Process each PDF in the directory (extension matched case-insensitively)
    for filename in sorted(os.listdir(directory_path)):
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(directory_path, filename)
        print(f"Processing {filename}...")

        try:
            results = process_pdf(api_key, pdf_path, questions_file_path, prompt_file_path)
            markdown_text = "\n".join(results)

            # Save the results to a Markdown file
            base_name = os.path.splitext(filename)[0]
            output_file_path = os.path.join(output_directory, f"{base_name}_Summary.md")
            with open(output_file_path, "w") as output_file:
                output_file.write(markdown_text)

            print(f"Summary for {filename} saved to {output_file_path}")
        except Exception as e:
            # Best-effort batch: report the failure and continue with the next plan.
            print(f"An error occurred while processing {filename}: {e}")
98
+
99
+ if __name__ == "__main__":
100
+ main()