import os import re import streamlit as st from tempfile import NamedTemporaryFile import anthropic # Import necessary modules from LangChain from langchain.chains import create_retrieval_chain from langchain.chains.combine_documents import create_stuff_documents_chain from langchain_core.prompts import ChatPromptTemplate from langchain_openai import ChatOpenAI, OpenAIEmbeddings from langchain_community.document_loaders import PyPDFLoader, TextLoader from langchain_community.vectorstores import FAISS from langchain_text_splitters import RecursiveCharacterTextSplitter # Function to remove code block markers from the answer def remove_code_blocks(text): """ Removes code block markers from the answer text. Args: text (str): The text from which code block markers should be removed. Returns: str: The text without code block markers. """ code_block_pattern = r"^```(?:\w+)?\n(.*?)\n```$" match = re.match(code_block_pattern, text, re.DOTALL) if match: return match.group(1).strip() else: return text # Function to process PDF, run Q&A, and return results def process_pdf(api_key, uploaded_file, questions_path, prompt_path, display_placeholder): """ Processes a PDF file, runs Q&A, and returns the results. Args: api_key (str): OpenAI API key. uploaded_file: Uploaded PDF file. questions_path (str): Path to the questions file. prompt_path (str): Path to the system prompt file. display_placeholder: Streamlit placeholder for displaying results. Returns: list: List of QA results. """ # Set the OpenAI API key os.environ["OPENAI_API_KEY"] = api_key # Save the uploaded PDF to a temporary file with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf: temp_pdf.write(uploaded_file.read()) temp_pdf_path = temp_pdf.name # Load and split the PDF into documents loader = PyPDFLoader(temp_pdf_path) docs = loader.load() text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500) splits = text_splitter.split_documents(docs) # Create a vector store from the documents vectorstore = FAISS.from_documents( documents=splits, embedding=OpenAIEmbeddings(model="text-embedding-3-large") ) retriever = vectorstore.as_retriever(search_kwargs={"k": 10}) # Load the system prompt if os.path.exists(prompt_path): with open(prompt_path, "r") as file: system_prompt = file.read() else: raise FileNotFoundError(f"The specified file was not found: {prompt_path}") # Create the prompt template prompt = ChatPromptTemplate.from_messages( [ ("system", system_prompt), ("human", "{input}"), ] ) # Initialize the language model llm = ChatOpenAI(model="gpt-4o") # Create the question-answering chain question_answer_chain = create_stuff_documents_chain( llm, prompt, document_variable_name="context" ) rag_chain = create_retrieval_chain(retriever, question_answer_chain) # Load the questions if os.path.exists(questions_path): with open(questions_path, "r") as file: questions = [line.strip() for line in file.readlines() if line.strip()] else: raise FileNotFoundError(f"The specified file was not found: {questions_path}") # Process each question qa_results = [] for question in questions: result = rag_chain.invoke({"input": question}) answer = result["answer"] # Remove code block markers answer = remove_code_blocks(answer) qa_text = f"### Question: {question}\n**Answer:**\n{answer}\n" qa_results.append(qa_text) display_placeholder.markdown("\n".join(qa_results), unsafe_allow_html=True) # Clean up temporary PDF file os.remove(temp_pdf_path) return qa_results # Function to perform multi-plan QA using an existing vector store def process_multi_plan_qa(api_key, input_text, display_placeholder): """ Performs multi-plan QA using an existing shared vector store. Args: api_key (str): OpenAI API key. input_text (str): The question to ask. display_placeholder: Streamlit placeholder for displaying results. """ # Set the OpenAI API key os.environ["OPENAI_API_KEY"] = api_key # Load the existing vector store embeddings = OpenAIEmbeddings(model="text-embedding-3-large") vector_store = FAISS.load_local( "Combined_Summary_Vectorstore", embeddings, allow_dangerous_deserialization=True ) # Convert the vector store to a retriever retriever = vector_store.as_retriever(search_kwargs={"k": 50}) # Read the system prompt for multi-document QA prompt_path = "Prompts/multi_document_qa_system_prompt.md" if os.path.exists(prompt_path): with open(prompt_path, "r") as file: system_prompt = file.read() else: raise FileNotFoundError(f"The specified file was not found: {prompt_path}") # Create the prompt template prompt = ChatPromptTemplate.from_messages( [ ("system", system_prompt), ("human", "{input}"), ] ) # Create the question-answering chain llm = ChatOpenAI(model="gpt-4o") question_answer_chain = create_stuff_documents_chain( llm, prompt, document_variable_name="context" ) rag_chain = create_retrieval_chain(retriever, question_answer_chain) # Process the input text result = rag_chain.invoke({"input": input_text}) answer = result["answer"] # Display the answer display_placeholder.markdown(f"**Answer:**\n{answer}") # Function to perform multi-plan QA using multiple individual vector stores def process_multi_plan_qa_multi_vectorstore(api_key, input_text, display_placeholder): """ Performs multi-plan QA using multiple individual vector stores. Args: api_key (str): OpenAI API key. input_text (str): The question to ask. display_placeholder: Streamlit placeholder for displaying results. """ # Set the OpenAI API key os.environ["OPENAI_API_KEY"] = api_key # Directory containing individual vector stores vectorstore_directory = "Individual_Summary_Vectorstores" # List all vector store directories vectorstore_names = [ d for d in os.listdir(vectorstore_directory) if os.path.isdir(os.path.join(vectorstore_directory, d)) ] # Initialize a list to collect all retrieved chunks all_retrieved_chunks = [] # Process each vector store for vectorstore_name in vectorstore_names: vectorstore_path = os.path.join(vectorstore_directory, vectorstore_name) # Load the vector store embeddings = OpenAIEmbeddings(model="text-embedding-3-large") vector_store = FAISS.load_local( vectorstore_path, embeddings, allow_dangerous_deserialization=True ) # Convert the vector store to a retriever retriever = vector_store.as_retriever(search_kwargs={"k": 2}) # Retrieve relevant chunks for the input text retrieved_chunks = retriever.invoke(input_text) all_retrieved_chunks.extend(retrieved_chunks) # Read the system prompt for multi-document QA prompt_path = "Prompts/multi_document_qa_system_prompt.md" if os.path.exists(prompt_path): with open(prompt_path, "r") as file: system_prompt = file.read() else: raise FileNotFoundError(f"The specified file was not found: {prompt_path}") # Create the prompt template prompt = ChatPromptTemplate.from_messages( [ ("system", system_prompt), ("human", "{input}"), ] ) # Create the question-answering chain llm = ChatOpenAI(model="gpt-4o") question_answer_chain = create_stuff_documents_chain( llm, prompt, document_variable_name="context" ) # Process the combined context result = question_answer_chain.invoke({ "input": input_text, "context": all_retrieved_chunks }) # Display the answer answer = result["answer"] if "answer" in result else result display_placeholder.markdown(f"**Answer:**\n{answer}") def load_documents_from_pdf(file): """ Loads documents from a PDF file. Args: file: Uploaded PDF file. Returns: list: List of documents. """ # Check if the file is a PDF if not file.name.endswith('.pdf'): raise ValueError("The uploaded file is not a PDF. Please upload a PDF file.") with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf: temp_pdf.write(file.read()) temp_pdf_path = temp_pdf.name loader = PyPDFLoader(temp_pdf_path) docs = loader.load() os.remove(temp_pdf_path) return docs def load_vector_store_from_path(path): """ Loads a vector store from a given path. Args: path (str): Path to the vector store. Returns: FAISS: Loaded vector store. """ embeddings = OpenAIEmbeddings(model="text-embedding-3-large") return FAISS.load_local( path, embeddings, allow_dangerous_deserialization=True ) # Function to compare documents via one-to-many query approach def process_one_to_many_query(api_key, focus_input, comparison_inputs, input_text, display_placeholder): """ Compares a focus document against multiple comparison documents using a one-to-many query approach. Args: api_key (str): OpenAI API key. focus_input: Focus document (uploaded file or path to vector store). comparison_inputs: List of comparison documents (uploaded files or paths to vector stores). input_text (str): The comparison question to ask. display_placeholder: Streamlit placeholder for displaying results. """ # Set the OpenAI API key os.environ["OPENAI_API_KEY"] = api_key print(comparison_inputs) # Load focus documents or vector store if isinstance(focus_input, st.runtime.uploaded_file_manager.UploadedFile): # If focus_input is an uploaded PDF file focus_docs = load_documents_from_pdf(focus_input) text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500) focus_splits = text_splitter.split_documents(focus_docs) focus_vector_store = FAISS.from_documents( focus_splits, OpenAIEmbeddings(model="text-embedding-3-large") ) focus_retriever = focus_vector_store.as_retriever(search_kwargs={"k": 5}) elif isinstance(focus_input, str) and os.path.isdir(focus_input): # If focus_input is a path to a vector store focus_vector_store = load_vector_store_from_path(focus_input) focus_retriever = focus_vector_store.as_retriever(search_kwargs={"k": 5}) else: raise ValueError("Invalid focus input type. Must be a PDF file or a path to a vector store.") # Retrieve relevant chunks from the focus document focus_docs = focus_retriever.invoke(input_text) # Initialize list to collect comparison chunks comparison_chunks = [] for comparison_input in comparison_inputs: if isinstance(comparison_input, st.runtime.uploaded_file_manager.UploadedFile): # If comparison_input is an uploaded PDF file comparison_docs = load_documents_from_pdf(comparison_input) text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=500) comparison_splits = text_splitter.split_documents(comparison_docs) comparison_vector_store = FAISS.from_documents( comparison_splits, OpenAIEmbeddings(model="text-embedding-3-large") ) comparison_retriever = comparison_vector_store.as_retriever(search_kwargs={"k": 5}) elif isinstance(comparison_input, str) and os.path.isdir(comparison_input): # If comparison_input is a path to a vector store comparison_vector_store = load_vector_store_from_path(comparison_input) comparison_retriever = comparison_vector_store.as_retriever(search_kwargs={"k": 5}) else: raise ValueError("Invalid comparison input type. Must be a PDF file or a path to a vector store.") # Retrieve relevant chunks from the comparison document comparison_docs = comparison_retriever.invoke(input_text) comparison_chunks.extend(comparison_docs) # Construct the combined context combined_context = focus_docs + comparison_chunks # Read the system prompt prompt_path = "Prompts/comparison_prompt.md" if os.path.exists(prompt_path): with open(prompt_path, "r") as file: system_prompt = file.read() else: raise FileNotFoundError(f"The specified file was not found: {prompt_path}") # Create the prompt template prompt = ChatPromptTemplate.from_messages( [ ("system", system_prompt), ("human", "{input}") ] ) # Create the question-answering chain llm = ChatOpenAI(model="gpt-4o") question_answer_chain = create_stuff_documents_chain( llm, prompt, document_variable_name="context" ) # Process the combined context result = question_answer_chain.invoke({ "context": combined_context, "input": input_text }) # Display the answer answer = result["answer"] if "answer" in result else result display_placeholder.markdown(f"**Answer:**\n{answer}") # Function to list vector store documents def list_vector_store_documents(): """ Lists available vector store documents. Returns: list: List of document names. """ # Assuming documents are stored in the "Individual_All_Vectorstores" directory directory_path = "Individual_All_Vectorstores" if not os.path.exists(directory_path): raise FileNotFoundError( f"The directory '{directory_path}' does not exist. " "Run `create_and_save_individual_vector_stores()` to create it." ) # List all available vector stores by document name documents = [ f.replace("_vectorstore", "").replace("_", " ") for f in os.listdir(directory_path) if f.endswith("_vectorstore") ] return documents # Function to compare plans using a long context model def compare_with_long_context(api_key, anthropic_api_key, input_text, focus_plan_path, selected_summaries, display_placeholder): """ Compares plans using a long context model. Args: api_key (str): OpenAI API key. anthropic_api_key (str): Anthropic API key. input_text (str): The comparison question to ask. focus_plan_path: Path to the focus plan or uploaded file. selected_summaries (list): List of selected summary documents. display_placeholder: Streamlit placeholder for displaying results. """ # Set the API keys os.environ["OPENAI_API_KEY"] = api_key os.environ["ANTHROPIC_API_KEY"] = anthropic_api_key # Load focus documents if isinstance(focus_plan_path, st.runtime.uploaded_file_manager.UploadedFile): # If focus_plan_path is an uploaded file focus_docs = load_documents_from_pdf(focus_plan_path) elif isinstance(focus_plan_path, str): # If focus_plan_path is a file path focus_loader = PyPDFLoader(focus_plan_path) focus_docs = focus_loader.load() else: raise ValueError("Invalid focus plan input type. Must be an uploaded file or a file path.") # Concatenate selected summary documents summaries_directory = "CAPS_Summaries" summaries_content = "" for filename in selected_summaries: # Fix the filename by replacing ' Summary' with '_Summary' summary_filename = f"{filename.replace(' Summary', '_Summary')}.md" with open(os.path.join(summaries_directory, summary_filename), 'r') as file: summaries_content += file.read() + "\n\n" # Prepare the context focus_context = "\n\n".join([doc.page_content for doc in focus_docs]) # Create the client and message client = anthropic.Anthropic(api_key=anthropic_api_key) response = client.completions.create( model="claude-2", max_tokens_to_sample=1024, prompt=f"{input_text}\n\nFocus Document:\n{focus_context}\n\nSummaries:\n{summaries_content}" ) # Display the answer answer = response.completion display_placeholder.markdown(f"**Answer:**\n{answer}", unsafe_allow_html=True) # Streamlit app layout with tabs st.title("Climate Policy Analysis Tool") # API Key Input api_key = st.text_input("Enter your OpenAI API key:", type="password", key="openai_key") # Create tabs tab1, tab2, tab3, tab4, tab5 = st.tabs([ "Summary Generation", "Multi-Plan QA (Shared Vectorstore)", "Multi-Plan QA (Multi-Vectorstore)", "Plan Comparison Tool", "Plan Comparison with Long Context Model" ]) # First tab: Summary Generation with tab1: uploaded_file = st.file_uploader( "Upload a Climate Action Plan in PDF format", type="pdf", key="upload_file" ) prompt_file_path = "Prompts/summary_tool_system_prompt.md" questions_file_path = "Prompts/summary_tool_questions.md" if st.button("Generate", key="generate_button"): if not api_key: st.warning("Please provide your OpenAI API key.") elif not uploaded_file: st.warning("Please upload a PDF file.") else: display_placeholder = st.empty() with st.spinner("Processing..."): try: results = process_pdf( api_key, uploaded_file, questions_file_path, prompt_file_path, display_placeholder ) markdown_text = "\n".join(results) # Use the uploaded file's name for the download file base_name = os.path.splitext(uploaded_file.name)[0] download_file_name = f"{base_name}_Summary.md" st.download_button( label="Download Results as Markdown", data=markdown_text, file_name=download_file_name, mime="text/markdown", key="download_button" ) except Exception as e: st.error(f"An error occurred: {e}") # Second tab: Multi-Plan QA (Shared Vectorstore) with tab2: input_text = st.text_input("Ask a question:", key="multi_plan_input") if st.button("Ask", key="multi_plan_qa_button"): if not api_key: st.warning("Please provide your OpenAI API key.") elif not input_text: st.warning("Please enter a question.") else: display_placeholder2 = st.empty() with st.spinner("Processing..."): try: process_multi_plan_qa( api_key, input_text, display_placeholder2 ) except Exception as e: st.error(f"An error occurred: {e}") # Third tab: Multi-Plan QA (Multi-Vectorstore) with tab3: user_input = st.text_input("Ask a question:", key="multi_vectorstore_input") if st.button("Ask", key="multi_vectorstore_qa_button"): if not api_key: st.warning("Please provide your OpenAI API key.") elif not user_input: st.warning("Please enter a question.") else: display_placeholder3 = st.empty() with st.spinner("Processing..."): try: process_multi_plan_qa_multi_vectorstore( api_key, user_input, display_placeholder3 ) except Exception as e: st.error(f"An error occurred: {e}") # Fourth tab: Plan Comparison Tool with tab4: st.header("Plan Comparison Tool") # List of documents from vector stores vectorstore_documents = list_vector_store_documents() # Option to upload a new plan or select from existing vector stores focus_option = st.radio( "Choose a focus plan:", ("Select from existing vector stores", "Upload a new plan"), key="focus_option" ) if focus_option == "Upload a new plan": focus_uploaded_file = st.file_uploader( "Upload a Climate Action Plan to compare", type="pdf", key="focus_upload" ) if focus_uploaded_file is not None: # Directly use the uploaded file focus_input = focus_uploaded_file else: focus_input = None else: # Select a focus plan from existing vector stores selected_focus_plan = st.selectbox( "Select a focus plan:", vectorstore_documents, key="select_focus_plan" ) focus_input = os.path.join( "Individual_All_Vectorstores", f"{selected_focus_plan.replace(' Summary', '_Summary')}_vectorstore" ) # Option to upload comparison documents or select from existing vector stores comparison_option = st.radio( "Choose comparison documents:", ("Select from existing vector stores", "Upload new documents"), key="comparison_option" ) if comparison_option == "Upload new documents": comparison_files = st.file_uploader( "Upload comparison documents", type="pdf", accept_multiple_files=True, key="comparison_files" ) comparison_inputs = comparison_files else: # Select comparison documents from existing vector stores selected_comparison_plans = st.multiselect( "Select comparison documents:", vectorstore_documents, key="select_comparison_plans" ) comparison_inputs = [ os.path.join( "Individual_All_Vectorstores", f"{doc.replace(' Summary', '_Summary')}_vectorstore" ) for doc in selected_comparison_plans ] input_text = st.text_input( "Ask a comparison question:", key="comparison_input" ) if st.button("Compare", key="compare_button"): if not api_key: st.warning("Please provide your OpenAI API key.") elif not input_text: st.warning("Please enter a comparison question.") elif not focus_input: st.warning("Please provide a focus plan.") elif not comparison_inputs: st.warning("Please provide comparison documents.") else: display_placeholder4 = st.empty() with st.spinner("Processing..."): try: # Call the process_one_to_many_query function process_one_to_many_query( api_key, focus_input, comparison_inputs, input_text, display_placeholder4 ) except Exception as e: st.error(f"An error occurred: {e}") # Fifth tab: Plan Comparison with Long Context Model with tab5: st.header("Plan Comparison with Long Context Model") # Anthropics API Key Input anthropic_api_key = st.text_input( "Enter your Anthropic API key:", type="password", key="anthropic_key" ) # Option to upload a new plan or select from a list focus_option = st.radio( "Choose a focus plan:", ("Select from existing plans", "Upload a new plan"), key="focus_option_long_context" ) if focus_option == "Upload a new plan": focus_uploaded_file = st.file_uploader( "Upload a Climate Action Plan to compare", type="pdf", key="focus_upload_long_context" ) if focus_uploaded_file is not None: # Directly use the uploaded file focus_plan_path = focus_uploaded_file else: focus_plan_path = None else: # List of existing plans in CAPS plan_list = [f.replace(".pdf", "") for f in os.listdir("CAPS") if f.endswith('.pdf')] selected_focus_plan = st.selectbox( "Select a focus plan:", plan_list, key="select_focus_plan_long_context" ) focus_plan_path = os.path.join("CAPS", f"{selected_focus_plan}.pdf") # List available summary documents for selection summaries_directory = "CAPS_Summaries" summary_files = [ f.replace(".md", "").replace("_", " ") for f in os.listdir(summaries_directory) if f.endswith('.md') ] selected_summaries = st.multiselect( "Select summary documents for comparison:", summary_files, key="selected_summaries" ) input_text = st.text_input( "Ask a comparison question:", key="comparison_input_long_context" ) if st.button("Compare with Long Context", key="compare_button_long_context"): if not api_key: st.warning("Please provide your OpenAI API key.") elif not anthropic_api_key: st.warning("Please provide your Anthropic API key.") elif not input_text: st.warning("Please enter a comparison question.") elif not focus_plan_path: st.warning("Please provide a focus plan.") else: display_placeholder = st.empty() with st.spinner("Processing..."): try: compare_with_long_context( api_key, anthropic_api_key, input_text, focus_plan_path, selected_summaries, display_placeholder ) except Exception as e: st.error(f"An error occurred: {e}")