|
import os |
|
import re |
|
import streamlit as st |
|
from tempfile import NamedTemporaryFile |
|
import anthropic |
|
|
|
|
|
from langchain.chains import create_retrieval_chain |
|
from langchain.chains.combine_documents import create_stuff_documents_chain |
|
from langchain_core.prompts import ChatPromptTemplate |
|
from langchain_openai import ChatOpenAI, OpenAIEmbeddings |
|
from langchain_community.document_loaders import PyPDFLoader, TextLoader |
|
from langchain_community.vectorstores import FAISS |
|
from langchain_text_splitters import RecursiveCharacterTextSplitter |
|
|
|
|
|
def remove_code_blocks(text):
    """
    Strip a single wrapping Markdown code fence from *text*.

    If the entire string is enclosed in triple-backtick fences (optionally
    with a language tag after the opening fence), return the inner content
    with surrounding whitespace trimmed; otherwise return the text as-is.

    Args:
        text (str): Text that may be wrapped in a code fence.

    Returns:
        str: The unwrapped text, or the original text when no fence matches.
    """
    fence_re = r"^```(?:\w+)?\n(.*?)\n```$"
    fenced = re.match(fence_re, text, re.DOTALL)
    return fenced.group(1).strip() if fenced else text
|
|
|
|
|
def process_pdf(api_key, uploaded_file, questions_path, prompt_path, display_placeholder):
    """
    Run a question-and-answer pass over an uploaded PDF.

    The PDF is split into overlapping chunks, indexed into a FAISS vector
    store, and each question from ``questions_path`` is answered with a
    retrieval-augmented GPT-4o chain. Partial results are streamed into the
    Streamlit placeholder as each answer arrives.

    Args:
        api_key (str): OpenAI API key.
        uploaded_file: Uploaded PDF file (Streamlit UploadedFile).
        questions_path (str): Path to a newline-separated questions file.
        prompt_path (str): Path to the system prompt file.
        display_placeholder: Streamlit placeholder for displaying results.

    Returns:
        list: Markdown-formatted QA result strings, one per question.

    Raises:
        FileNotFoundError: If the prompt or questions file is missing.
    """
    os.environ["OPENAI_API_KEY"] = api_key

    # PyPDFLoader needs a real file path, so spill the upload to disk.
    with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        temp_pdf.write(uploaded_file.read())
        temp_pdf_path = temp_pdf.name

    try:
        loader = PyPDFLoader(temp_pdf_path)
        docs = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
        splits = text_splitter.split_documents(docs)

        vectorstore = FAISS.from_documents(
            documents=splits,
            embedding=OpenAIEmbeddings(model="text-embedding-3-large")
        )
        retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

        if os.path.exists(prompt_path):
            with open(prompt_path, "r") as file:
                system_prompt = file.read()
        else:
            raise FileNotFoundError(f"The specified file was not found: {prompt_path}")

        prompt = ChatPromptTemplate.from_messages(
            [
                ("system", system_prompt),
                ("human", "{input}"),
            ]
        )

        llm = ChatOpenAI(model="gpt-4o")

        question_answer_chain = create_stuff_documents_chain(
            llm, prompt, document_variable_name="context"
        )
        rag_chain = create_retrieval_chain(retriever, question_answer_chain)

        if os.path.exists(questions_path):
            with open(questions_path, "r") as file:
                questions = [line.strip() for line in file.readlines() if line.strip()]
        else:
            raise FileNotFoundError(f"The specified file was not found: {questions_path}")

        qa_results = []
        for question in questions:
            result = rag_chain.invoke({"input": question})
            # Models sometimes wrap markdown answers in code fences; unwrap.
            answer = remove_code_blocks(result["answer"])

            qa_text = f"### Question: {question}\n**Answer:**\n{answer}\n"
            qa_results.append(qa_text)
            # Update the UI incrementally so the user sees progress.
            display_placeholder.markdown("\n".join(qa_results), unsafe_allow_html=True)
    finally:
        # Always delete the temp file, even when processing fails,
        # so aborted runs don't leak PDFs into the temp directory.
        os.remove(temp_pdf_path)

    return qa_results
|
|
|
|
|
def process_multi_plan_qa(api_key, input_text, display_placeholder):
    """
    Answer a question against the shared multi-plan vector store.

    Loads the prebuilt combined FAISS store from disk, retrieves the top
    matching chunks, and answers with a GPT-4o retrieval chain.

    Args:
        api_key (str): OpenAI API key.
        input_text (str): The question to ask.
        display_placeholder: Streamlit placeholder for displaying results.

    Raises:
        FileNotFoundError: If the system prompt file is missing.
    """
    os.environ["OPENAI_API_KEY"] = api_key

    # Reload the prebuilt combined vector store from disk.
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    vector_store = FAISS.load_local(
        "Combined_Summary_Vectorstore",
        embeddings,
        allow_dangerous_deserialization=True
    )
    retriever = vector_store.as_retriever(search_kwargs={"k": 50})

    prompt_path = "Prompts/multi_document_qa_system_prompt.md"
    if not os.path.exists(prompt_path):
        raise FileNotFoundError(f"The specified file was not found: {prompt_path}")
    with open(prompt_path, "r") as file:
        system_prompt = file.read()

    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "{input}"),
        ]
    )

    llm = ChatOpenAI(model="gpt-4o")
    question_answer_chain = create_stuff_documents_chain(
        llm, prompt, document_variable_name="context"
    )
    rag_chain = create_retrieval_chain(retriever, question_answer_chain)

    answer = rag_chain.invoke({"input": input_text})["answer"]
    display_placeholder.markdown(f"**Answer:**\n{answer}")
|
|
|
|
|
def process_multi_plan_qa_multi_vectorstore(api_key, input_text, display_placeholder):
    """
    Answer a question by querying every individual plan vector store.

    Each store under ``Individual_Summary_Vectorstores`` contributes its top
    chunks; the combined context is fed to a GPT-4o answer chain.

    Args:
        api_key (str): OpenAI API key.
        input_text (str): The question to ask.
        display_placeholder: Streamlit placeholder for displaying results.

    Raises:
        FileNotFoundError: If the system prompt file is missing.
    """
    os.environ["OPENAI_API_KEY"] = api_key

    vectorstore_directory = "Individual_Summary_Vectorstores"

    vectorstore_names = [
        d for d in os.listdir(vectorstore_directory)
        if os.path.isdir(os.path.join(vectorstore_directory, d))
    ]

    # Create the embeddings client once instead of once per store
    # (it was previously rebuilt inside the loop on every iteration).
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

    all_retrieved_chunks = []
    for vectorstore_name in vectorstore_names:
        vectorstore_path = os.path.join(vectorstore_directory, vectorstore_name)
        vector_store = FAISS.load_local(
            vectorstore_path,
            embeddings,
            allow_dangerous_deserialization=True
        )

        # Only a couple of chunks per plan, since every plan contributes.
        retriever = vector_store.as_retriever(search_kwargs={"k": 2})
        all_retrieved_chunks.extend(retriever.invoke(input_text))

    prompt_path = "Prompts/multi_document_qa_system_prompt.md"
    if os.path.exists(prompt_path):
        with open(prompt_path, "r") as file:
            system_prompt = file.read()
    else:
        raise FileNotFoundError(f"The specified file was not found: {prompt_path}")

    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "{input}"),
        ]
    )

    llm = ChatOpenAI(model="gpt-4o")
    question_answer_chain = create_stuff_documents_chain(
        llm, prompt, document_variable_name="context"
    )

    result = question_answer_chain.invoke({
        "input": input_text,
        "context": all_retrieved_chunks
    })

    # create_stuff_documents_chain may return a plain string; tolerate both.
    answer = result["answer"] if "answer" in result else result
    display_placeholder.markdown(f"**Answer:**\n{answer}")
|
|
|
def load_documents_from_pdf(file):
    """
    Load LangChain documents from an uploaded PDF file.

    Args:
        file: Uploaded PDF file (must expose ``.name`` and ``.read()``).

    Returns:
        list: List of documents produced by PyPDFLoader.

    Raises:
        ValueError: If the uploaded file is not a PDF.
    """
    if not file.name.endswith('.pdf'):
        raise ValueError("The uploaded file is not a PDF. Please upload a PDF file.")

    # PyPDFLoader requires a real path, so spill the upload to a temp file.
    with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        temp_pdf.write(file.read())
        temp_pdf_path = temp_pdf.name

    try:
        loader = PyPDFLoader(temp_pdf_path)
        docs = loader.load()
    finally:
        # Clean up the temp file even if PDF parsing fails.
        os.remove(temp_pdf_path)
    return docs
|
|
|
def load_vector_store_from_path(path):
    """
    Load a FAISS vector store saved at *path*.

    Args:
        path (str): Path to the vector store directory.

    Returns:
        FAISS: The deserialized vector store.
    """
    embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")
    # These stores are produced locally by this project, so deserializing
    # them is treated as safe here.
    store = FAISS.load_local(
        path,
        embedding_model,
        allow_dangerous_deserialization=True
    )
    return store
|
|
|
|
|
def process_one_to_many_query(api_key, focus_input, comparison_inputs, input_text, display_placeholder):
    """
    Compare a focus document against multiple comparison documents.

    Each input may be either a Streamlit-uploaded PDF (indexed on the fly)
    or a path to a prebuilt FAISS vector store directory. The top chunks
    retrieved from the focus and comparison sources are combined into one
    context for a GPT-4o comparison answer.

    Args:
        api_key (str): OpenAI API key.
        focus_input: Focus document (uploaded file or vector store path).
        comparison_inputs: List of comparison documents (uploaded files or
            paths to vector stores).
        input_text (str): The comparison question to ask.
        display_placeholder: Streamlit placeholder for displaying results.

    Raises:
        ValueError: If an input is neither an uploaded PDF nor a directory.
        FileNotFoundError: If the comparison prompt file is missing.
    """
    os.environ["OPENAI_API_KEY"] = api_key

    # Share one embeddings client across all on-the-fly index builds
    # (previously one was constructed per uploaded comparison document).
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

    if isinstance(focus_input, st.runtime.uploaded_file_manager.UploadedFile):
        focus_docs = load_documents_from_pdf(focus_input)
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
        focus_splits = text_splitter.split_documents(focus_docs)
        focus_vector_store = FAISS.from_documents(focus_splits, embeddings)
        focus_retriever = focus_vector_store.as_retriever(search_kwargs={"k": 5})
    elif isinstance(focus_input, str) and os.path.isdir(focus_input):
        focus_vector_store = load_vector_store_from_path(focus_input)
        focus_retriever = focus_vector_store.as_retriever(search_kwargs={"k": 5})
    else:
        raise ValueError("Invalid focus input type. Must be a PDF file or a path to a vector store.")

    focus_docs = focus_retriever.invoke(input_text)

    comparison_chunks = []
    for comparison_input in comparison_inputs:
        if isinstance(comparison_input, st.runtime.uploaded_file_manager.UploadedFile):
            comparison_docs = load_documents_from_pdf(comparison_input)
            # NOTE(review): comparison chunks are smaller (1000) than focus
            # chunks (3000) — confirm this asymmetry is intentional.
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=500)
            comparison_splits = text_splitter.split_documents(comparison_docs)
            comparison_vector_store = FAISS.from_documents(comparison_splits, embeddings)
            comparison_retriever = comparison_vector_store.as_retriever(search_kwargs={"k": 5})
        elif isinstance(comparison_input, str) and os.path.isdir(comparison_input):
            comparison_vector_store = load_vector_store_from_path(comparison_input)
            comparison_retriever = comparison_vector_store.as_retriever(search_kwargs={"k": 5})
        else:
            raise ValueError("Invalid comparison input type. Must be a PDF file or a path to a vector store.")

        comparison_chunks.extend(comparison_retriever.invoke(input_text))

    combined_context = focus_docs + comparison_chunks

    prompt_path = "Prompts/comparison_prompt.md"
    if os.path.exists(prompt_path):
        with open(prompt_path, "r") as file:
            system_prompt = file.read()
    else:
        raise FileNotFoundError(f"The specified file was not found: {prompt_path}")

    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "{input}")
        ]
    )

    llm = ChatOpenAI(model="gpt-4o")
    question_answer_chain = create_stuff_documents_chain(
        llm,
        prompt,
        document_variable_name="context"
    )

    result = question_answer_chain.invoke({
        "context": combined_context,
        "input": input_text
    })

    # create_stuff_documents_chain may return a plain string; tolerate both.
    answer = result["answer"] if "answer" in result else result
    display_placeholder.markdown(f"**Answer:**\n{answer}")
|
|
|
|
|
def list_vector_store_documents():
    """
    List the documents available as individual vector stores.

    Returns:
        list: Human-readable document names derived from the
        ``*_vectorstore`` entries under ``Individual_All_Vectorstores``.

    Raises:
        FileNotFoundError: If the vector store directory is missing.
    """
    directory_path = "Individual_All_Vectorstores"
    if not os.path.exists(directory_path):
        raise FileNotFoundError(
            f"The directory '{directory_path}' does not exist. "
            "Run `create_and_save_individual_vector_stores()` to create it."
        )

    names = []
    for entry in os.listdir(directory_path):
        if entry.endswith("_vectorstore"):
            # Turn e.g. "Some_City_vectorstore" into "Some City".
            names.append(entry.replace("_vectorstore", "").replace("_", " "))
    return names
|
|
|
|
|
def compare_with_long_context(api_key, anthropic_api_key, input_text, focus_plan_path, selected_summaries, display_placeholder):
    """
    Compare a focus plan against selected summaries with a long-context LLM.

    The full text of the focus plan and the selected summary markdown files
    are concatenated into a single Anthropic completion prompt.

    Args:
        api_key (str): OpenAI API key (exported for any downstream use).
        anthropic_api_key (str): Anthropic API key.
        input_text (str): The comparison question to ask.
        focus_plan_path: Path to the focus plan PDF or an uploaded file.
        selected_summaries (list): Names of summary documents to include.
        display_placeholder: Streamlit placeholder for displaying results.

    Raises:
        ValueError: If the focus plan input type is unsupported.
        FileNotFoundError: If a selected summary file is missing.
    """
    os.environ["OPENAI_API_KEY"] = api_key
    os.environ["ANTHROPIC_API_KEY"] = anthropic_api_key

    if isinstance(focus_plan_path, st.runtime.uploaded_file_manager.UploadedFile):
        focus_docs = load_documents_from_pdf(focus_plan_path)
    elif isinstance(focus_plan_path, str):
        focus_loader = PyPDFLoader(focus_plan_path)
        focus_docs = focus_loader.load()
    else:
        raise ValueError("Invalid focus plan input type. Must be an uploaded file or a file path.")

    # Concatenate the selected summary markdown files into one text blob.
    summaries_directory = "CAPS_Summaries"
    summaries_content = ""
    for filename in selected_summaries:
        summary_filename = f"{filename.replace(' Summary', '_Summary')}.md"
        with open(os.path.join(summaries_directory, summary_filename), 'r') as file:
            summaries_content += file.read() + "\n\n"

    focus_context = "\n\n".join([doc.page_content for doc in focus_docs])

    client = anthropic.Anthropic(api_key=anthropic_api_key)
    # The legacy Text Completions API requires the prompt to start with
    # "\n\nHuman:" and end with "\n\nAssistant:"; requests without these
    # markers are rejected. The original raw prompt lacked them.
    response = client.completions.create(
        model="claude-2",
        max_tokens_to_sample=1024,
        prompt=(
            f"{anthropic.HUMAN_PROMPT}{input_text}\n\n"
            f"Focus Document:\n{focus_context}\n\n"
            f"Summaries:\n{summaries_content}{anthropic.AI_PROMPT}"
        )
    )

    answer = response.completion
    display_placeholder.markdown(f"**Answer:**\n{answer}", unsafe_allow_html=True)
|
|
|
|
|
# --- Streamlit UI ---------------------------------------------------------

st.title("Climate Policy Analysis Tool")

# The OpenAI key is collected once and shared by every tab below.
api_key = st.text_input("Enter your OpenAI API key:", type="password", key="openai_key")

# One tab per analysis workflow.
tab1, tab2, tab3, tab4, tab5 = st.tabs([
    "Summary Generation",
    "Multi-Plan QA (Shared Vectorstore)",
    "Multi-Plan QA (Multi-Vectorstore)",
    "Plan Comparison Tool",
    "Plan Comparison with Long Context Model"
])
|
|
|
|
|
with tab1:
    # Tab 1: generate a markdown summary of a single uploaded plan.
    uploaded_file = st.file_uploader(
        "Upload a Climate Action Plan in PDF format",
        type="pdf",
        key="upload_file"
    )

    prompt_file_path = "Prompts/summary_tool_system_prompt.md"
    questions_file_path = "Prompts/summary_tool_questions.md"

    if st.button("Generate", key="generate_button"):
        # Validate inputs before starting the (slow) QA pass.
        if not api_key:
            st.warning("Please provide your OpenAI API key.")
        elif not uploaded_file:
            st.warning("Please upload a PDF file.")
        else:
            display_placeholder = st.empty()
            with st.spinner("Processing..."):
                try:
                    results = process_pdf(
                        api_key,
                        uploaded_file,
                        questions_file_path,
                        prompt_file_path,
                        display_placeholder
                    )
                    markdown_text = "\n".join(results)

                    # Offer the combined answers as a downloadable markdown
                    # file named after the uploaded PDF.
                    base_name = os.path.splitext(uploaded_file.name)[0]
                    download_file_name = f"{base_name}_Summary.md"

                    st.download_button(
                        label="Download Results as Markdown",
                        data=markdown_text,
                        file_name=download_file_name,
                        mime="text/markdown",
                        key="download_button"
                    )
                except Exception as e:
                    st.error(f"An error occurred: {e}")
|
|
|
|
|
with tab2:
    # Tab 2: question answering over the combined (shared) vector store.
    input_text = st.text_input("Ask a question:", key="multi_plan_input")
    if st.button("Ask", key="multi_plan_qa_button"):
        if not api_key:
            st.warning("Please provide your OpenAI API key.")
        elif not input_text:
            st.warning("Please enter a question.")
        else:
            display_placeholder2 = st.empty()
            with st.spinner("Processing..."):
                try:
                    process_multi_plan_qa(
                        api_key,
                        input_text,
                        display_placeholder2
                    )
                except Exception as e:
                    st.error(f"An error occurred: {e}")
|
|
|
|
|
with tab3:
    # Tab 3: question answering that queries every per-plan vector store.
    user_input = st.text_input("Ask a question:", key="multi_vectorstore_input")
    if st.button("Ask", key="multi_vectorstore_qa_button"):
        if not api_key:
            st.warning("Please provide your OpenAI API key.")
        elif not user_input:
            st.warning("Please enter a question.")
        else:
            display_placeholder3 = st.empty()
            with st.spinner("Processing..."):
                try:
                    process_multi_plan_qa_multi_vectorstore(
                        api_key,
                        user_input,
                        display_placeholder3
                    )
                except Exception as e:
                    st.error(f"An error occurred: {e}")
|
|
|
|
|
with tab4:
    # Tab 4: RAG-based comparison of a focus plan against other plans.
    st.header("Plan Comparison Tool")

    vectorstore_documents = list_vector_store_documents()

    # The focus plan can be an existing vector store or a fresh upload.
    focus_option = st.radio(
        "Choose a focus plan:",
        ("Select from existing vector stores", "Upload a new plan"),
        key="focus_option"
    )

    if focus_option == "Upload a new plan":
        focus_uploaded_file = st.file_uploader(
            "Upload a Climate Action Plan to compare",
            type="pdf",
            key="focus_upload"
        )
        if focus_uploaded_file is not None:
            focus_input = focus_uploaded_file
        else:
            focus_input = None
    else:
        selected_focus_plan = st.selectbox(
            "Select a focus plan:",
            vectorstore_documents,
            key="select_focus_plan"
        )
        # Map the display name back to its vector store directory.
        focus_input = os.path.join(
            "Individual_All_Vectorstores",
            f"{selected_focus_plan.replace(' Summary', '_Summary')}_vectorstore"
        )

    # Comparison documents may likewise be stores or uploads.
    comparison_option = st.radio(
        "Choose comparison documents:",
        ("Select from existing vector stores", "Upload new documents"),
        key="comparison_option"
    )

    if comparison_option == "Upload new documents":
        comparison_files = st.file_uploader(
            "Upload comparison documents",
            type="pdf",
            accept_multiple_files=True,
            key="comparison_files"
        )
        comparison_inputs = comparison_files
    else:
        selected_comparison_plans = st.multiselect(
            "Select comparison documents:",
            vectorstore_documents,
            key="select_comparison_plans"
        )
        # Map each display name back to its vector store directory.
        comparison_inputs = [
            os.path.join(
                "Individual_All_Vectorstores",
                f"{doc.replace(' Summary', '_Summary')}_vectorstore"
            ) for doc in selected_comparison_plans
        ]

    input_text = st.text_input(
        "Ask a comparison question:",
        key="comparison_input"
    )

    if st.button("Compare", key="compare_button"):
        # Validate all inputs before kicking off the comparison.
        if not api_key:
            st.warning("Please provide your OpenAI API key.")
        elif not input_text:
            st.warning("Please enter a comparison question.")
        elif not focus_input:
            st.warning("Please provide a focus plan.")
        elif not comparison_inputs:
            st.warning("Please provide comparison documents.")
        else:
            display_placeholder4 = st.empty()
            with st.spinner("Processing..."):
                try:
                    process_one_to_many_query(
                        api_key,
                        focus_input,
                        comparison_inputs,
                        input_text,
                        display_placeholder4
                    )
                except Exception as e:
                    st.error(f"An error occurred: {e}")
|
|
|
|
|
with tab5:
    # Tab 5: full-text comparison using Anthropic's long-context model.
    st.header("Plan Comparison with Long Context Model")

    anthropic_api_key = st.text_input(
        "Enter your Anthropic API key:",
        type="password",
        key="anthropic_key"
    )

    focus_option = st.radio(
        "Choose a focus plan:",
        ("Select from existing plans", "Upload a new plan"),
        key="focus_option_long_context"
    )

    if focus_option == "Upload a new plan":
        focus_uploaded_file = st.file_uploader(
            "Upload a Climate Action Plan to compare",
            type="pdf",
            key="focus_upload_long_context"
        )
        if focus_uploaded_file is not None:
            focus_plan_path = focus_uploaded_file
        else:
            focus_plan_path = None
    else:
        # Offer the plan PDFs shipped with the project under CAPS/.
        plan_list = [f.replace(".pdf", "") for f in os.listdir("CAPS") if f.endswith('.pdf')]
        selected_focus_plan = st.selectbox(
            "Select a focus plan:",
            plan_list,
            key="select_focus_plan_long_context"
        )
        focus_plan_path = os.path.join("CAPS", f"{selected_focus_plan}.pdf")

    # Summary markdown files the user can include as comparison context.
    summaries_directory = "CAPS_Summaries"
    summary_files = [
        f.replace(".md", "").replace("_", " ")
        for f in os.listdir(summaries_directory) if f.endswith('.md')
    ]
    selected_summaries = st.multiselect(
        "Select summary documents for comparison:",
        summary_files,
        key="selected_summaries"
    )

    input_text = st.text_input(
        "Ask a comparison question:",
        key="comparison_input_long_context"
    )

    if st.button("Compare with Long Context", key="compare_button_long_context"):
        # Validate all inputs before calling the model.
        if not api_key:
            st.warning("Please provide your OpenAI API key.")
        elif not anthropic_api_key:
            st.warning("Please provide your Anthropic API key.")
        elif not input_text:
            st.warning("Please enter a comparison question.")
        elif not focus_plan_path:
            st.warning("Please provide a focus plan.")
        else:
            display_placeholder = st.empty()
            with st.spinner("Processing..."):
                try:
                    compare_with_long_context(
                        api_key,
                        anthropic_api_key,
                        input_text,
                        focus_plan_path,
                        selected_summaries,
                        display_placeholder
                    )
                except Exception as e:
                    st.error(f"An error occurred: {e}")
|
|