Spaces:

vanderbilt-dsi
/

climate-plan-summary-tool

Running

App Files Files Community

climate-plan-summary-tool / app.py

umangchaudhry

Update app.py

a93e85e verified 8 months ago

raw

history blame contribute delete

26.5 kB

	import os
	import re
	import streamlit as st
	from tempfile import NamedTemporaryFile
	import anthropic

	# Import necessary modules from LangChain
	from langchain.chains import create_retrieval_chain
	from langchain.chains.combine_documents import create_stuff_documents_chain
	from langchain_core.prompts import ChatPromptTemplate
	from langchain_openai import ChatOpenAI, OpenAIEmbeddings
	from langchain_community.document_loaders import PyPDFLoader, TextLoader
	from langchain_community.vectorstores import FAISS
	from langchain_text_splitters import RecursiveCharacterTextSplitter

	# Function to remove code block markers from the answer
	def remove_code_blocks(text):
	    """
	    Strip a single Markdown code fence that wraps the entire answer.

	    The answer is unwrapped only when, after ignoring surrounding
	    whitespace, it consists of exactly one opening fence line (with an
	    optional language tag such as ``markdown`` or ``c++``), a body, and a
	    closing fence. Both LF and CRLF line endings are accepted.

	    Args:
	        text (str): The text from which code block markers should be removed.

	    Returns:
	        str: The fenced body with surrounding whitespace stripped, or the
	        original ``text`` unchanged when it is not wrapped in a fence.
	    """
	    # Opening fence + optional tag, the body, then the closing fence.
	    fence_pattern = r"```[^\s`]*[ \t]*\r?\n(.*?)\r?\n```"
	    # Models frequently pad the fenced answer with blank lines or spaces;
	    # strip first so that padding does not defeat the match.
	    match = re.fullmatch(fence_pattern, text.strip(), re.DOTALL)
	    if match:
	        return match.group(1).strip()
	    return text

	# Function to process PDF, run Q&A, and return results
	def process_pdf(api_key, uploaded_file, questions_path, prompt_path, display_placeholder):
	    """
	    Processes a PDF file, runs Q&A over it, and returns the results.

	    The prompt and questions files are read before any embedding work so a
	    missing file fails fast. The temporary copy of the PDF is always removed,
	    even when loading fails.

	    Args:
	        api_key (str): OpenAI API key (exported as ``OPENAI_API_KEY``).
	        uploaded_file: Uploaded PDF file; only ``read()`` is used.
	        questions_path (str): Path to the questions file, one question per line.
	        prompt_path (str): Path to the system prompt file.
	        display_placeholder: Streamlit placeholder for displaying results.

	    Returns:
	        list: One Markdown string per question, in file order.

	    Raises:
	        FileNotFoundError: If ``prompt_path`` or ``questions_path`` does not exist.
	    """
	    # Set the OpenAI API key
	    os.environ["OPENAI_API_KEY"] = api_key

	    # Validate both input files up front, before spending API calls on embeddings.
	    if not os.path.exists(prompt_path):
	        raise FileNotFoundError(f"The specified file was not found: {prompt_path}")
	    if not os.path.exists(questions_path):
	        raise FileNotFoundError(f"The specified file was not found: {questions_path}")

	    with open(prompt_path, "r", encoding="utf-8") as file:
	        system_prompt = file.read()
	    with open(questions_path, "r", encoding="utf-8") as file:
	        # Blank lines are not questions.
	        questions = [line.strip() for line in file if line.strip()]

	    # PyPDFLoader needs a real path, so spill the upload to a temporary file.
	    temp_pdf = NamedTemporaryFile(delete=False, suffix=".pdf")
	    try:
	        with temp_pdf:
	            temp_pdf.write(uploaded_file.read())
	        docs = PyPDFLoader(temp_pdf.name).load()
	    finally:
	        # The pages are already loaded into memory at this point; remove the
	        # file whether or not loading succeeded.
	        os.remove(temp_pdf.name)

	    # Split the PDF pages into overlapping chunks
	    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
	    splits = text_splitter.split_documents(docs)

	    # Create a vector store from the chunks
	    vectorstore = FAISS.from_documents(
	        documents=splits,
	        embedding=OpenAIEmbeddings(model="text-embedding-3-large"),
	    )
	    retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

	    # Create the prompt template
	    prompt = ChatPromptTemplate.from_messages(
	        [
	            ("system", system_prompt),
	            ("human", "{input}"),
	        ]
	    )

	    # Build the retrieval + question-answering chain
	    llm = ChatOpenAI(model="gpt-4o")
	    question_answer_chain = create_stuff_documents_chain(
	        llm, prompt, document_variable_name="context"
	    )
	    rag_chain = create_retrieval_chain(retriever, question_answer_chain)

	    # Process each question, re-rendering the running transcript after each one
	    qa_results = []
	    for question in questions:
	        result = rag_chain.invoke({"input": question})
	        answer = remove_code_blocks(result["answer"])

	        qa_results.append(f"### Question: {question}\nAnswer:\n{answer}\n")
	        # NOTE(review): model output is rendered with raw HTML enabled;
	        # confirm this is intended for content derived from uploaded PDFs.
	        display_placeholder.markdown("\n".join(qa_results), unsafe_allow_html=True)

	    return qa_results

	# Function to perform multi-plan QA using an existing vector store
	def process_multi_plan_qa(api_key, input_text, display_placeholder):
	    """
	    Answers a question against the shared, pre-built vector store.

	    Args:
	        api_key (str): OpenAI API key (exported as ``OPENAI_API_KEY``).
	        input_text (str): The question to ask.
	        display_placeholder: Streamlit placeholder that receives the answer.

	    Raises:
	        FileNotFoundError: If the multi-document system prompt file is missing.
	    """
	    os.environ["OPENAI_API_KEY"] = api_key

	    # Open the combined store from disk and expose its 50 nearest chunks.
	    # NOTE: load_local unpickles the index, hence the explicit opt-in flag.
	    shared_store = FAISS.load_local(
	        "Combined_Summary_Vectorstore",
	        OpenAIEmbeddings(model="text-embedding-3-large"),
	        allow_dangerous_deserialization=True,
	    )
	    shared_retriever = shared_store.as_retriever(search_kwargs={"k": 50})

	    # System prompt for multi-document QA
	    template_path = "Prompts/multi_document_qa_system_prompt.md"
	    if not os.path.exists(template_path):
	        raise FileNotFoundError(f"The specified file was not found: {template_path}")
	    with open(template_path, "r") as handle:
	        system_text = handle.read()

	    chat_prompt = ChatPromptTemplate.from_messages(
	        [("system", system_text), ("human", "{input}")]
	    )

	    # Retrieval chain: retriever -> stuff retrieved documents into the prompt -> model
	    combine_chain = create_stuff_documents_chain(
	        ChatOpenAI(model="gpt-4o"), chat_prompt, document_variable_name="context"
	    )
	    pipeline = create_retrieval_chain(shared_retriever, combine_chain)

	    response = pipeline.invoke({"input": input_text})
	    display_placeholder.markdown(f"Answer:\n{response['answer']}")

	# Function to perform multi-plan QA using multiple individual vector stores
	def process_multi_plan_qa_multi_vectorstore(api_key, input_text, display_placeholder):
	    """
	    Performs multi-plan QA using multiple individual vector stores.

	    The top 2 chunks are retrieved from every vector store found under
	    ``Individual_Summary_Vectorstores``; the pooled chunks are then passed
	    as context to a single question-answering call.

	    Args:
	        api_key (str): OpenAI API key (exported as ``OPENAI_API_KEY``).
	        input_text (str): The question to ask.
	        display_placeholder: Streamlit placeholder for displaying results.

	    Raises:
	        FileNotFoundError: If the multi-document system prompt file is missing.
	    """
	    # Set the OpenAI API key
	    os.environ["OPENAI_API_KEY"] = api_key

	    # Directory containing individual vector stores
	    vectorstore_directory = "Individual_Summary_Vectorstores"

	    # Sorted so the order of the pooled context does not depend on the
	    # arbitrary order in which the filesystem lists entries.
	    vectorstore_names = sorted(
	        d for d in os.listdir(vectorstore_directory)
	        if os.path.isdir(os.path.join(vectorstore_directory, d))
	    )

	    # The embedding client is identical for every store; build it once.
	    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

	    # Collect the retrieved chunks from every store
	    all_retrieved_chunks = []
	    for vectorstore_name in vectorstore_names:
	        vector_store = FAISS.load_local(
	            os.path.join(vectorstore_directory, vectorstore_name),
	            embeddings,
	            allow_dangerous_deserialization=True,
	        )
	        retriever = vector_store.as_retriever(search_kwargs={"k": 2})
	        all_retrieved_chunks.extend(retriever.invoke(input_text))

	    # Read the system prompt for multi-document QA
	    prompt_path = "Prompts/multi_document_qa_system_prompt.md"
	    if not os.path.exists(prompt_path):
	        raise FileNotFoundError(f"The specified file was not found: {prompt_path}")
	    with open(prompt_path, "r") as file:
	        system_prompt = file.read()

	    # Create the prompt template
	    prompt = ChatPromptTemplate.from_messages(
	        [
	            ("system", system_prompt),
	            ("human", "{input}"),
	        ]
	    )

	    # Create the question-answering chain
	    llm = ChatOpenAI(model="gpt-4o")
	    question_answer_chain = create_stuff_documents_chain(
	        llm, prompt, document_variable_name="context"
	    )

	    # Process the combined context
	    result = question_answer_chain.invoke({
	        "input": input_text,
	        "context": all_retrieved_chunks,
	    })

	    # The chain may hand back a plain string. On a str, `"answer" in result`
	    # is a substring test, so indexing with "answer" would raise TypeError
	    # whenever the reply contains that word. Only index real mappings.
	    if isinstance(result, dict) and "answer" in result:
	        answer = result["answer"]
	    else:
	        answer = result
	    display_placeholder.markdown(f"Answer:\n{answer}")

	def load_documents_from_pdf(file):
	    """
	    Loads documents from an uploaded PDF file.

	    The upload is written to a temporary ``.pdf`` file so that
	    ``PyPDFLoader`` can read it from a path; that file is removed again
	    whether or not loading succeeds.

	    Args:
	        file: Uploaded PDF file exposing ``name`` and ``read()``.

	    Returns:
	        list: List of documents produced by ``PyPDFLoader``.

	    Raises:
	        ValueError: If the file name does not end in ``.pdf`` (any letter case).
	    """
	    # Compare case-insensitively so names such as "PLAN.PDF" are accepted.
	    if not file.name.lower().endswith(".pdf"):
	        raise ValueError("The uploaded file is not a PDF. Please upload a PDF file.")

	    temp_pdf = NamedTemporaryFile(delete=False, suffix=".pdf")
	    try:
	        # Close the handle before the loader opens the path.
	        with temp_pdf:
	            temp_pdf.write(file.read())
	        return PyPDFLoader(temp_pdf.name).load()
	    finally:
	        # Never leave the temporary copy behind, even if loading raised.
	        os.remove(temp_pdf.name)

	def load_vector_store_from_path(path):
	    """
	    Opens a FAISS vector store previously saved at ``path``.

	    Args:
	        path (str): Path to the vector store.

	    Returns:
	        FAISS: The loaded vector store, bound to ``text-embedding-3-large``
	        OpenAI embeddings.
	    """
	    # NOTE: load_local unpickles the index, hence the explicit opt-in flag.
	    return FAISS.load_local(
	        path,
	        OpenAIEmbeddings(model="text-embedding-3-large"),
	        allow_dangerous_deserialization=True,
	    )

	# Function to compare documents via one-to-many query approach
	def process_one_to_many_query(api_key, focus_input, comparison_inputs, input_text, display_placeholder):
	    """
	    Compares a focus document against multiple comparison documents using a one-to-many query approach.

	    The 5 most relevant chunks are retrieved from the focus document and from
	    each comparison document; all chunks (focus first) are passed as context
	    to a single question-answering call.

	    Args:
	        api_key (str): OpenAI API key (exported as ``OPENAI_API_KEY``).
	        focus_input: Focus document (uploaded file or path to vector store).
	        comparison_inputs: List of comparison documents (uploaded files or paths to vector stores).
	        input_text (str): The comparison question to ask.
	        display_placeholder: Streamlit placeholder for displaying results.

	    Raises:
	        ValueError: If the focus input or any comparison input is neither an
	            uploaded file nor a path to an existing directory.
	        FileNotFoundError: If the comparison prompt file is missing.
	    """
	    # Set the OpenAI API key
	    os.environ["OPENAI_API_KEY"] = api_key

	    # Retrieve relevant chunks from the focus document (uploads split at 3000 chars)
	    focus_retriever = _one_to_many_retriever(focus_input, 3000, "focus")
	    focus_docs = focus_retriever.invoke(input_text)

	    # Retrieve relevant chunks from every comparison document (uploads split at 1000 chars)
	    comparison_chunks = []
	    for comparison_input in comparison_inputs:
	        comparison_retriever = _one_to_many_retriever(comparison_input, 1000, "comparison")
	        comparison_chunks.extend(comparison_retriever.invoke(input_text))

	    # Construct the combined context
	    combined_context = focus_docs + comparison_chunks

	    # Read the system prompt
	    prompt_path = "Prompts/comparison_prompt.md"
	    if not os.path.exists(prompt_path):
	        raise FileNotFoundError(f"The specified file was not found: {prompt_path}")
	    with open(prompt_path, "r") as file:
	        system_prompt = file.read()

	    # Create the prompt template
	    prompt = ChatPromptTemplate.from_messages(
	        [
	            ("system", system_prompt),
	            ("human", "{input}"),
	        ]
	    )

	    # Create the question-answering chain
	    llm = ChatOpenAI(model="gpt-4o")
	    question_answer_chain = create_stuff_documents_chain(
	        llm,
	        prompt,
	        document_variable_name="context",
	    )

	    # Process the combined context
	    result = question_answer_chain.invoke({
	        "context": combined_context,
	        "input": input_text,
	    })

	    # The chain may hand back a plain string. On a str, `"answer" in result`
	    # is a substring test, so indexing with "answer" would raise TypeError
	    # whenever the reply contains that word. Only index real mappings.
	    if isinstance(result, dict) and "answer" in result:
	        answer = result["answer"]
	    else:
	        answer = result
	    display_placeholder.markdown(f"Answer:\n{answer}")


	def _one_to_many_retriever(source, chunk_size, role):
	    """Return a top-5 retriever for an uploaded PDF or a saved vector store directory."""
	    if isinstance(source, st.runtime.uploaded_file_manager.UploadedFile):
	        # Uploaded PDF: load, chunk and embed it into an in-memory store.
	        pages = load_documents_from_pdf(source)
	        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=500)
	        store = FAISS.from_documents(
	            splitter.split_documents(pages),
	            OpenAIEmbeddings(model="text-embedding-3-large"),
	        )
	    elif isinstance(source, str) and os.path.isdir(source):
	        # Path to a vector store saved on disk.
	        store = load_vector_store_from_path(source)
	    else:
	        raise ValueError(f"Invalid {role} input type. Must be a PDF file or a path to a vector store.")
	    return store.as_retriever(search_kwargs={"k": 5})

	# Function to list vector store documents
	def list_vector_store_documents():
	    """
	    Lists available vector store documents.

	    Every entry of ``Individual_All_Vectorstores`` whose name ends in
	    ``_vectorstore`` is reported, with ``_vectorstore`` removed and the
	    remaining underscores shown as spaces.

	    Returns:
	        list: Document names in sorted order, so UI option lists and their
	        default selection are stable between runs and machines.

	    Raises:
	        FileNotFoundError: If the vector store directory does not exist.
	    """
	    directory_path = "Individual_All_Vectorstores"
	    if not os.path.exists(directory_path):
	        raise FileNotFoundError(
	            f"The directory '{directory_path}' does not exist. "
	            "Run `create_and_save_individual_vector_stores()` to create it."
	        )
	    # os.listdir returns entries in arbitrary order; sort for determinism.
	    return sorted(
	        entry.replace("_vectorstore", "").replace("_", " ")
	        for entry in os.listdir(directory_path)
	        if entry.endswith("_vectorstore")
	    )

	# Function to compare plans using a long context model
	def compare_with_long_context(api_key, anthropic_api_key, input_text, focus_plan_path, selected_summaries, display_placeholder):
	    """
	    Compares plans using a long context model.

	    The full text of the focus plan and the full text of every selected
	    summary are placed in a single prompt, together with the question, and
	    sent to the Anthropic completions endpoint.

	    Args:
	        api_key (str): OpenAI API key (exported as ``OPENAI_API_KEY``).
	        anthropic_api_key (str): Anthropic API key.
	        input_text (str): The comparison question to ask.
	        focus_plan_path: Path to the focus plan or uploaded file.
	        selected_summaries (list): List of selected summary documents
	            (display names; `` Summary`` is mapped back to ``_Summary``).
	        display_placeholder: Streamlit placeholder for displaying results.

	    Raises:
	        ValueError: If ``focus_plan_path`` is neither an uploaded file nor a string.
	        FileNotFoundError: If a selected summary file cannot be opened.
	    """
	    # Set the API keys
	    os.environ["OPENAI_API_KEY"] = api_key
	    os.environ["ANTHROPIC_API_KEY"] = anthropic_api_key

	    # Load focus documents
	    if isinstance(focus_plan_path, st.runtime.uploaded_file_manager.UploadedFile):
	        focus_docs = load_documents_from_pdf(focus_plan_path)
	    elif isinstance(focus_plan_path, str):
	        focus_docs = PyPDFLoader(focus_plan_path).load()
	    else:
	        raise ValueError("Invalid focus plan input type. Must be an uploaded file or a file path.")

	    # Read the selected summaries; collect the parts and join once instead of
	    # repeatedly growing a string.
	    summaries_directory = "CAPS_Summaries"
	    summary_parts = []
	    for filename in selected_summaries:
	        # Display names show "_Summary" as " Summary"; undo that to get the file name.
	        summary_filename = f"{filename.replace(' Summary', '_Summary')}.md"
	        summary_path = os.path.join(summaries_directory, summary_filename)
	        with open(summary_path, "r", encoding="utf-8") as file:
	            summary_parts.append(file.read() + "\n\n")
	    summaries_content = "".join(summary_parts)

	    # Prepare the context
	    focus_context = "\n\n".join(doc.page_content for doc in focus_docs)
	    question_block = (
	        f"{input_text}\n\nFocus Document:\n{focus_context}\n\nSummaries:\n{summaries_content}"
	    )

	    # The Text Completions endpoint expects the prompt to be framed as a
	    # "\n\nHuman:" turn followed by an open "\n\nAssistant:" turn; a bare
	    # prompt is rejected by the API.
	    # NOTE(review): Text Completions / claude-2 is a legacy interface;
	    # consider migrating to the Messages API.
	    client = anthropic.Anthropic(api_key=anthropic_api_key)
	    response = client.completions.create(
	        model="claude-2",
	        max_tokens_to_sample=1024,
	        prompt=f"\n\nHuman: {question_block}\n\nAssistant:",
	    )

	    # Display the answer
	    answer = response.completion
	    display_placeholder.markdown(f"Answer:\n{answer}", unsafe_allow_html=True)

	# Streamlit app layout with tabs
	st.title("Climate Policy Analysis Tool")

	# The OpenAI key is collected once at the top and shared by every tab.
	api_key = st.text_input("Enter your OpenAI API key:", type="password", key="openai_key")

	# One tab per tool, in display order.
	tab_labels = [
	    "Summary Generation",
	    "Multi-Plan QA (Shared Vectorstore)",
	    "Multi-Plan QA (Multi-Vectorstore)",
	    "Plan Comparison Tool",
	    "Plan Comparison with Long Context Model",
	]
	tab1, tab2, tab3, tab4, tab5 = st.tabs(tab_labels)

	# First tab: Summary Generation
	with tab1:
	    uploaded_file = st.file_uploader(
	        "Upload a Climate Action Plan in PDF format", type="pdf", key="upload_file"
	    )

	    # Prompt and question list that drive the summary.
	    prompt_file_path = "Prompts/summary_tool_system_prompt.md"
	    questions_file_path = "Prompts/summary_tool_questions.md"

	    if st.button("Generate", key="generate_button"):
	        if not api_key:
	            st.warning("Please provide your OpenAI API key.")
	        elif not uploaded_file:
	            st.warning("Please upload a PDF file.")
	        else:
	            display_placeholder = st.empty()
	            with st.spinner("Processing..."):
	                try:
	                    results = process_pdf(
	                        api_key,
	                        uploaded_file,
	                        questions_file_path,
	                        prompt_file_path,
	                        display_placeholder,
	                    )

	                    # Offer the transcript as "<uploaded name without extension>_Summary.md".
	                    pdf_stem = os.path.splitext(uploaded_file.name)[0]
	                    st.download_button(
	                        label="Download Results as Markdown",
	                        data="\n".join(results),
	                        file_name=f"{pdf_stem}_Summary.md",
	                        mime="text/markdown",
	                        key="download_button",
	                    )
	                except Exception as e:
	                    # Top-level UI boundary: report the failure instead of crashing the page.
	                    st.error(f"An error occurred: {e}")

	# Second tab: Multi-Plan QA (Shared Vectorstore)
	with tab2:
	    input_text = st.text_input("Ask a question:", key="multi_plan_input")
	    ask_shared_clicked = st.button("Ask", key="multi_plan_qa_button")
	    if ask_shared_clicked:
	        if not api_key:
	            st.warning("Please provide your OpenAI API key.")
	        elif not input_text:
	            st.warning("Please enter a question.")
	        else:
	            display_placeholder2 = st.empty()
	            with st.spinner("Processing..."):
	                try:
	                    process_multi_plan_qa(api_key, input_text, display_placeholder2)
	                except Exception as e:
	                    # Top-level UI boundary: report the failure instead of crashing the page.
	                    st.error(f"An error occurred: {e}")

	# Third tab: Multi-Plan QA (Multi-Vectorstore)
	with tab3:
	    user_input = st.text_input("Ask a question:", key="multi_vectorstore_input")
	    ask_multi_clicked = st.button("Ask", key="multi_vectorstore_qa_button")
	    if ask_multi_clicked:
	        if not api_key:
	            st.warning("Please provide your OpenAI API key.")
	        elif not user_input:
	            st.warning("Please enter a question.")
	        else:
	            display_placeholder3 = st.empty()
	            with st.spinner("Processing..."):
	                try:
	                    process_multi_plan_qa_multi_vectorstore(
	                        api_key, user_input, display_placeholder3
	                    )
	                except Exception as e:
	                    # Top-level UI boundary: report the failure instead of crashing the page.
	                    st.error(f"An error occurred: {e}")

	# Fourth tab: Plan Comparison Tool
	with tab4:
	    st.header("Plan Comparison Tool")

	    # List of documents from vector stores. A missing directory is reported
	    # in place rather than aborting the whole page (and the tab after it).
	    try:
	        vectorstore_documents = list_vector_store_documents()
	    except FileNotFoundError as e:
	        st.error(str(e))
	        vectorstore_documents = []

	    # Option to upload a new plan or select from existing vector stores
	    focus_option = st.radio(
	        "Choose a focus plan:",
	        ("Select from existing vector stores", "Upload a new plan"),
	        key="focus_option"
	    )

	    if focus_option == "Upload a new plan":
	        # file_uploader yields None until a file is provided; use that directly.
	        focus_input = st.file_uploader(
	            "Upload a Climate Action Plan to compare",
	            type="pdf",
	            key="focus_upload"
	        )
	    else:
	        # Select a focus plan from existing vector stores
	        selected_focus_plan = st.selectbox(
	            "Select a focus plan:",
	            vectorstore_documents,
	            key="select_focus_plan"
	        )
	        if selected_focus_plan is None:
	            # Nothing to select (empty list): fall through to the
	            # "Please provide a focus plan." warning below.
	            focus_input = None
	        else:
	            focus_input = os.path.join(
	                "Individual_All_Vectorstores",
	                f"{selected_focus_plan.replace(' Summary', '_Summary')}_vectorstore"
	            )

	    # Option to upload comparison documents or select from existing vector stores
	    comparison_option = st.radio(
	        "Choose comparison documents:",
	        ("Select from existing vector stores", "Upload new documents"),
	        key="comparison_option"
	    )

	    if comparison_option == "Upload new documents":
	        comparison_inputs = st.file_uploader(
	            "Upload comparison documents",
	            type="pdf",
	            accept_multiple_files=True,
	            key="comparison_files"
	        )
	    else:
	        # Select comparison documents from existing vector stores
	        selected_comparison_plans = st.multiselect(
	            "Select comparison documents:",
	            vectorstore_documents,
	            key="select_comparison_plans"
	        )
	        comparison_inputs = [
	            os.path.join(
	                "Individual_All_Vectorstores",
	                f"{doc.replace(' Summary', '_Summary')}_vectorstore"
	            )
	            for doc in selected_comparison_plans
	        ]

	    input_text = st.text_input(
	        "Ask a comparison question:",
	        key="comparison_input"
	    )

	    if st.button("Compare", key="compare_button"):
	        if not api_key:
	            st.warning("Please provide your OpenAI API key.")
	        elif not input_text:
	            st.warning("Please enter a comparison question.")
	        elif not focus_input:
	            st.warning("Please provide a focus plan.")
	        elif not comparison_inputs:
	            st.warning("Please provide comparison documents.")
	        else:
	            display_placeholder4 = st.empty()
	            with st.spinner("Processing..."):
	                try:
	                    process_one_to_many_query(
	                        api_key,
	                        focus_input,
	                        comparison_inputs,
	                        input_text,
	                        display_placeholder4
	                    )
	                except Exception as e:
	                    # Top-level UI boundary: report the failure instead of crashing the page.
	                    st.error(f"An error occurred: {e}")

	# Fifth tab: Plan Comparison with Long Context Model
	with tab5:
	    st.header("Plan Comparison with Long Context Model")

	    # Anthropic API Key Input
	    anthropic_api_key = st.text_input(
	        "Enter your Anthropic API key:",
	        type="password",
	        key="anthropic_key"
	    )

	    # Option to upload a new plan or select from a list
	    focus_option = st.radio(
	        "Choose a focus plan:",
	        ("Select from existing plans", "Upload a new plan"),
	        key="focus_option_long_context"
	    )

	    if focus_option == "Upload a new plan":
	        # file_uploader yields None until a file is provided; use that directly.
	        focus_plan_path = st.file_uploader(
	            "Upload a Climate Action Plan to compare",
	            type="pdf",
	            key="focus_upload_long_context"
	        )
	    else:
	        # List of existing plans in CAPS. Only the extension is dropped, so a
	        # ".pdf" occurring inside the name survives and the path rebuilt
	        # below points at the real file.
	        try:
	            plan_list = sorted(
	                os.path.splitext(f)[0] for f in os.listdir("CAPS") if f.endswith('.pdf')
	            )
	        except FileNotFoundError:
	            st.error("The directory 'CAPS' does not exist.")
	            plan_list = []
	        selected_focus_plan = st.selectbox(
	            "Select a focus plan:",
	            plan_list,
	            key="select_focus_plan_long_context"
	        )
	        if selected_focus_plan is None:
	            # Nothing to select: let the "Please provide a focus plan."
	            # warning fire instead of building the bogus path "CAPS/None.pdf".
	            focus_plan_path = None
	        else:
	            focus_plan_path = os.path.join("CAPS", f"{selected_focus_plan}.pdf")

	    # List available summary documents for selection
	    summaries_directory = "CAPS_Summaries"
	    try:
	        summary_files = [
	            f.replace(".md", "").replace("_", " ")
	            for f in os.listdir(summaries_directory) if f.endswith('.md')
	        ]
	    except FileNotFoundError:
	        st.error(f"The directory '{summaries_directory}' does not exist.")
	        summary_files = []
	    selected_summaries = st.multiselect(
	        "Select summary documents for comparison:",
	        summary_files,
	        key="selected_summaries"
	    )

	    input_text = st.text_input(
	        "Ask a comparison question:",
	        key="comparison_input_long_context"
	    )

	    if st.button("Compare with Long Context", key="compare_button_long_context"):
	        if not api_key:
	            st.warning("Please provide your OpenAI API key.")
	        elif not anthropic_api_key:
	            st.warning("Please provide your Anthropic API key.")
	        elif not input_text:
	            st.warning("Please enter a comparison question.")
	        elif not focus_plan_path:
	            st.warning("Please provide a focus plan.")
	        else:
	            display_placeholder = st.empty()
	            with st.spinner("Processing..."):
	                try:
	                    compare_with_long_context(
	                        api_key,
	                        anthropic_api_key,
	                        input_text,
	                        focus_plan_path,
	                        selected_summaries,
	                        display_placeholder
	                    )
	                except Exception as e:
	                    # Top-level UI boundary: report the failure instead of crashing the page.
	                    st.error(f"An error occurred: {e}")