|
import os
import shutil

# System binaries the OCR pipeline shells out to, mapped to the apt package
# that provides each one (pdf2image needs poppler's pdftoppm, pytesseract
# needs the tesseract executable).
_REQUIRED_BINARIES = {
    "pdftoppm": "poppler-utils",
    "tesseract": "tesseract-ocr",
}

# This file is a Streamlit script, and Streamlit re-executes the script on
# user interaction, so only shell out to apt-get when a binary is actually
# missing, and do it in a single update + install round trip.
_missing_packages = [
    package
    for binary, package in _REQUIRED_BINARIES.items()
    if shutil.which(binary) is None
]
if _missing_packages:
    # The exit status is printed, as before, so failures show up in the logs.
    print(os.system(
        "apt-get update && apt-get install -y " + " ".join(_missing_packages)
    ))
|
|
|
import tempfile |
|
import streamlit as st |
|
from PIL import Image |
|
import pytesseract |
|
|
|
from pdf2image import convert_from_path |
|
import pypdf |
|
from dotenv import load_dotenv |
|
import time |
|
|
|
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage |
|
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder |
|
from langchain_core.output_parsers import StrOutputParser |
|
from langchain_together import Together |
|
from langchain_text_splitters import RecursiveCharacterTextSplitter |
|
from langchain_community.vectorstores import FAISS |
|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
|
|
import langgraph |
|
from langgraph.graph import END |
|
from typing import List, Dict, Any, TypedDict, Optional |
|
|
|
|
|
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the Together API key -- confirm.
load_dotenv()

# The page icon in the original source was a mis-encoded emoji (only the
# first byte survived, rendering as a Greek "pi"). The exact glyph cannot be
# recovered, so a book emoji is used as the stand-in.
st.set_page_config(
    page_title="Document Q&A",
    page_icon="📚",
    layout="wide",
    initial_sidebar_state="expanded",
)
|
|
|
|
|
# Custom stylesheet for the whole page. It is injected once through
# st.markdown with unsafe_allow_html=True; the class names defined here are
# the ones used by the HTML fragments rendered elsewhere in this file.
_PAGE_CSS = """
<style>
/* Base styles */
.main {
    background-color: #f8fafc;
    color: #333;
    padding: 1rem;
}

/* Sidebar styling */
[data-testid="stSidebar"] {
    background-color: #1e293b;
    color: #f8fafc;
    padding: 1rem;
}

/* Example questions */
.example-button {
    background-color: #7c3aed;
    color: white;
    border: none;
    border-radius: 0.5rem;
    padding: 0.75rem 1rem;
    margin-bottom: 0.75rem;
    cursor: pointer;
    text-align: left;
    display: block;
    width: 100%;
    font-size: 0.9rem;
}

/* Chat container */
.chat-container {
    min-height: 60vh;
    overflow-y: auto;
    padding: 1rem;
    background-color: white;
    border-radius: 0.5rem;
    border: 1px solid #e2e8f0;
    margin-bottom: 1rem;
}

/* Sidebar title */
.sidebar-title {
    color: #f8fafc;
    font-size: 1.2rem;
    font-weight: 600;
    margin-bottom: 1rem;
    padding-bottom: 0.5rem;
    border-bottom: 1px solid #475569;
}

/* File list */
.file-item {
    padding: 0.5rem;
    background-color: #334155;
    border-radius: 0.25rem;
    margin-bottom: 0.5rem;
    color: #f8fafc;
}
.file-name {
    font-weight: 500;
}
.file-type {
    font-size: 0.75rem;
    color: #cbd5e1;
}

/* Instructions */
.instructions {
    color: #cbd5e1;
}
.instructions ol {
    margin-left: 1.5rem;
    padding-left: 0;
}
.instructions li {
    margin-bottom: 0.5rem;
}

/* Divider */
.divider {
    height: 1px;
    background-color: #475569;
    margin: 1.5rem 0;
}

/* Override Streamlit button styles */
.stButton > button {
    background-color: #7c3aed;
    color: white;
}

/* Override Streamlit file uploader */
.stFileUploader > div > div {
    background-color: #334155;
    color: #f8fafc;
    border: 1px dashed #7c3aed;
    border-radius: 0.5rem;
    padding: 1rem;
}

/* Controls section */
.controls-section {
    margin-top: 1rem;
}

/* Control buttons */
.control-button {
    background-color: #7c3aed;
    color: white;
    border: none;
    border-radius: 0.25rem;
    padding: 0.5rem 1rem;
    margin-right: 0.5rem;
    margin-bottom: 0.5rem;
    cursor: pointer;
}

/* How to use section */
.how-to-use {
    margin-bottom: 1.5rem;
}
.how-to-use ol {
    margin-left: 1.5rem;
    padding-left: 0;
}
.how-to-use li {
    margin-bottom: 0.5rem;
    color: #f8fafc;
}

/* Input field */
.stTextInput > div > div > input {
    border: 1px solid #e2e8f0;
    border-radius: 0.5rem;
    padding: 0.75rem;
    font-size: 1rem;
}

/* Form styling */
[data-testid="stForm"] {
    border: none;
    padding: 0;
}

/* Hide Streamlit branding */
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}

/* Chat messages */
.user-message {
    background-color: #f3f4f6;
    padding: 0.75rem;
    border-radius: 0.5rem;
    margin-bottom: 0.75rem;
    color: #1e293b;
}

.assistant-message {
    background-color: #f8fafc;
    padding: 0.75rem;
    border-radius: 0.5rem;
    margin-bottom: 0.75rem;
    border: 1px solid #e2e8f0;
    color: #1e293b;
}

/* Chat input container */
.chat-input-container {
    display: flex;
    align-items: center;
    background-color: white;
    border-radius: 0.5rem;
    padding: 0.5rem;
    border: 1px solid #e2e8f0;
}

/* Document status */
.document-status {
    padding: 0.5rem;
    border-radius: 0.5rem;
    margin-top: 0.5rem;
    font-size: 0.9rem;
}

.status-success {
    background-color: #dcfce7;
    color: #166534;
}

.status-waiting {
    background-color: #f3f4f6;
    color: #4b5563;
}

/* Tabs styling */
.stTabs [data-baseweb="tab-list"] {
    gap: 8px;
}

.stTabs [data-baseweb="tab"] {
    background-color: #f1f5f9;
    border-radius: 4px 4px 0 0;
    padding: 8px 16px;
    height: auto;
}

.stTabs [aria-selected="true"] {
    background-color: white !important;
    border-bottom: 2px solid #7c3aed !important;
}

/* Sidebar section headers */
.sidebar-section-header {
    color: #f8fafc;
    font-size: 1rem;
    font-weight: 600;
    margin-top: 1rem;
    margin-bottom: 0.5rem;
}

/* Sidebar file uploader label */
.sidebar-uploader-label {
    color: #f8fafc;
    font-size: 0.9rem;
    margin-bottom: 0.5rem;
}
</style>
"""

st.markdown(_PAGE_CSS, unsafe_allow_html=True)
|
|
|
|
|
# Canned prompts rendered as one-click buttons above the chat; clicking one
# sends its text to the agent exactly as if the user had typed it.
EXAMPLE_QUESTIONS = [
    "How do the different topics in these documents relate to each other?",
    "What is the structure of this document?",
    "Can you analyze the writing style of this text?",
    "Extract all dates and events mentioned in the document",
    "What are the main arguments presented in this document?",
]
|
|
|
|
|
@st.cache_resource
def get_llm():
    """Build the Together-hosted language model used by the RAG agent.

    The function is wrapped in ``st.cache_resource`` so Streamlit can reuse
    the returned client instead of constructing a new one on each call.
    """
    llm_settings = {
        "model": "deepseek-ai/DeepSeek-V3",
        "temperature": 0.7,
        "max_tokens": 1024,
    }
    return Together(**llm_settings)
|
|
|
|
|
@st.cache_resource
def get_embeddings():
    """Return the HuggingFace embedding model used to index document chunks.

    Wrapped in ``st.cache_resource`` so the model object can be reused.
    """
    embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=embedding_model_name)
|
|
|
|
|
@st.cache_resource
def get_text_splitter():
    """Return the splitter that cuts document text into overlapping chunks.

    Chunks are at most 1000 characters with a 200-character overlap between
    neighbours. Wrapped in ``st.cache_resource`` so the splitter is reused.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    return splitter
|
|
|
|
|
def extract_text_from_pdf(pdf_file):
    """Return the embedded text of every page of *pdf_file*, in page order.

    *pdf_file* is handed straight to ``pypdf.PdfReader``. Pages for which
    ``extract_text()`` yields a falsy value contribute an empty string, and
    page texts are concatenated without any separator.
    """
    reader = pypdf.PdfReader(pdf_file)
    return "".join(page.extract_text() or "" for page in reader.pages)
|
|
|
|
|
def extract_text_from_image(image_file):
    """Run Tesseract OCR on *image_file* and return the recognised text.

    *image_file* is whatever ``PIL.Image.open`` accepts (the caller in this
    file passes a temporary file path).
    """
    # Open the image as a context manager so its file handle is released
    # before the caller deletes the temporary file.
    with Image.open(image_file) as image:
        return pytesseract.image_to_string(image)
|
|
|
|
|
def process_pdf_with_ocr(pdf_file, min_text_chars=100):
    """Extract text from a PDF, falling back to OCR when it has little text.

    Args:
        pdf_file: Path of the PDF on disk. It is passed to both
            ``extract_text_from_pdf`` and ``pdf2image.convert_from_path``.
        min_text_chars: If the stripped text layer is shorter than this many
            characters, every page is rasterised and run through Tesseract.
            Defaults to 100, the threshold that used to be hard-coded.

    Returns:
        The text layer when it is long enough, otherwise the OCR text. If OCR
        produces only whitespace, the (short) text layer is returned rather
        than being thrown away.
    """
    text = extract_text_from_pdf(pdf_file)
    if len(text.strip()) >= min_text_chars:
        return text

    # Little or no embedded text: treat the PDF as scanned pages.
    page_images = convert_from_path(pdf_file)
    ocr_text = "".join(pytesseract.image_to_string(image) for image in page_images)

    # Prefer the OCR result, but never replace real text with a blank one.
    return ocr_text if ocr_text.strip() else text
|
|
|
|
|
def _extract_via_temp_file(data, extractor):
    """Write *data* to a temp file, run ``extractor(path)``, always delete it."""
    # delete=False because the extractor re-opens the file by path after this
    # handle has been closed; removal is done explicitly in ``finally``.
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    try:
        with temp_file:
            temp_file.write(data)
        return extractor(temp_file.name)
    finally:
        # Runs on failure too, so a crashing extractor cannot leak temp files.
        os.unlink(temp_file.name)


def process_uploaded_files(uploaded_files):
    """Extract text from each uploaded file and describe what was processed.

    Args:
        uploaded_files: Iterable of objects exposing ``.name`` (the file name)
            and ``.getvalue()`` (the raw bytes).

    Returns:
        A ``(combined_text, file_info)`` tuple. ``combined_text`` joins one
        ``--- Content from <name> ---`` section per file with blank lines.
        ``file_info`` is a list of ``{"name": ..., "type": ...}`` dicts where
        type is "PDF", "Image", "Text" or "Unknown".

    The file type is chosen from the (case-insensitive) extension. Text files
    are decoded as UTF-8 with undecodable bytes replaced, so a file in another
    encoding no longer aborts the whole batch. Unsupported files contribute an
    "Unsupported file format" line instead of content.
    """
    all_texts = []
    file_info = []

    for file in uploaded_files:
        lowered_name = file.name.lower()

        if lowered_name.endswith('.pdf'):
            text = _extract_via_temp_file(file.getvalue(), process_pdf_with_ocr)
            file_type = "PDF"
        elif lowered_name.endswith(('.png', '.jpg', '.jpeg')):
            text = _extract_via_temp_file(file.getvalue(), extract_text_from_image)
            file_type = "Image"
        elif lowered_name.endswith(('.txt', '.md')):
            # Plain text needs no temp file; decode the bytes directly.
            text = file.getvalue().decode('utf-8', errors='replace')
            file_type = "Text"
        else:
            text = f"Unsupported file format: {file.name}"
            file_type = "Unknown"

        all_texts.append(f"--- Content from {file.name} ---\n{text}")
        file_info.append({"name": file.name, "type": file_type})

    return "\n\n".join(all_texts), file_info
|
|
|
|
|
def create_vectorstore(text):
    """Split *text* into chunks and index them in a FAISS vector store.

    Chunking uses the splitter from ``get_text_splitter()`` and the chunks are
    embedded with the model from ``get_embeddings()``.
    """
    pieces = get_text_splitter().split_text(text)
    embedder = get_embeddings()
    return FAISS.from_texts(texts=pieces, embedding=embedder)
|
|
|
|
|
class GraphState(TypedDict):
    """State dictionary handed from node to node in the RAG graph."""

    # Conversation messages; the retrieve node reads the last one as the
    # query and the generate node returns the list with its reply appended.
    messages: List

    # Results of the vector-store similarity search for the current query.
    documents: List

    # Step-by-step reasoning text produced by the generate node.
    thinking: str
|
|
|
|
|
def create_rag_agent(vectorstore):
    """Compile a two-step retrieve -> generate graph over *vectorstore*.

    The returned object is the compiled graph; invoke it with a ``GraphState``
    dict. ``retrieve`` looks up the five chunks most similar to the last
    message, and ``generate`` makes two LLM calls: one that produces
    step-by-step reasoning over those chunks and one that turns the reasoning
    into the final answer.
    """
    from langgraph.graph import StateGraph

    def _format_context(documents):
        # Number the retrieved chunks from 1 so the prompt can refer to them.
        return "\n\n".join(
            f"Document {position}:\n{doc.page_content}"
            for position, doc in enumerate(documents, start=1)
        )

    def _run_chain(leading_system_text, trailing_system_text, history):
        # system instruction -> conversation -> context-bearing system message
        prompt = ChatPromptTemplate.from_messages([
            SystemMessage(content=leading_system_text),
            MessagesPlaceholder(variable_name="messages"),
            SystemMessage(content=trailing_system_text),
        ])
        chain = prompt | get_llm() | StrOutputParser()
        return chain.invoke({"messages": history})

    def retrieve(state: GraphState) -> GraphState:
        """Fetch the chunks most similar to the latest message."""
        history = state["messages"]
        question = history[-1].content
        hits = vectorstore.similarity_search(question, k=5)
        return {
            "documents": hits,
            "messages": history,
            "thinking": state.get("thinking", ""),
        }

    def generate(state: GraphState) -> GraphState:
        """Reason over the retrieved chunks, then write the final answer."""
        history = state["messages"]
        retrieved = state["documents"]
        context = _format_context(retrieved)

        # Pass 1: have the model reason about the question.
        thinking_result = _run_chain(
            "You are an assistant that thinks step by step before answering.",
            f"Here is relevant context from the knowledge base:\n{context}\n\nThink step by step about how to answer the query using this context.",
            history,
        )

        # Pass 2: produce the answer, feeding the reasoning back in.
        response = _run_chain(
            "You are a helpful assistant that provides accurate information based on the given context.",
            f"Here is relevant context from the knowledge base:\n{context}\n\nHere is your thinking process:\n{thinking_result}\n\nNow provide a clear and helpful answer based on this context and thinking.",
            history,
        )

        return {
            "messages": history + [AIMessage(content=response)],
            "thinking": thinking_result,
            "documents": retrieved,
        }

    graph = StateGraph(GraphState)
    graph.add_node("retrieve", retrieve)
    graph.add_node("generate", generate)

    # Linear pipeline: retrieve -> generate -> END.
    graph.set_entry_point("retrieve")
    graph.add_edge("retrieve", "generate")
    graph.add_edge("generate", END)

    return graph.compile()
|
|
|
|
|
def clear_session_state():
    """Delete every key currently stored in ``st.session_state``."""
    # Snapshot the keys first: entries are removed while walking the list.
    stale_keys = list(st.session_state.keys())
    for name in stale_keys:
        del st.session_state[name]
|
|
|
|
|
def _ask(question):
    """Record *question*, answer it, and store the reply in session state.

    Exactly one assistant message and one ``thinking_history`` entry are
    appended per question. Keeping the two lists the same length is what lets
    the chat view pair answer number n with ``thinking_history[n]``; the
    previous code skipped the history entry when no documents were loaded,
    which shifted every later "thinking" panel onto the wrong answer.
    """
    st.session_state.messages.append(HumanMessage(content=question))

    if "vectorstore" not in st.session_state:
        # Empty entry = "nothing to show", but it keeps the lists aligned.
        st.session_state.thinking_history.append("")
        st.session_state.messages.append(
            AIMessage(content="Please upload and process documents first.")
        )
        return

    with st.spinner("Thinking..."):
        rag_agent = create_rag_agent(st.session_state.vectorstore)
        # Only the current question is sent; earlier turns are not replayed.
        result = rag_agent.invoke({
            "messages": [HumanMessage(content=question)],
            "documents": [],
            "thinking": "",
        })
        st.session_state.thinking_history.append(result["thinking"])
        st.session_state.messages.append(result["messages"][-1])


def _render_sidebar():
    """Draw the sidebar: instructions, upload/processing, file list, controls."""
    import html  # function-scope import; only used to escape file names

    with st.sidebar:
        # NOTE: the emoji in the three "π ..." headings of the original were
        # mis-encoded beyond recovery; the glyphs below are stand-ins.
        st.markdown('<div class="sidebar-title">📚 Document Q&A</div>', unsafe_allow_html=True)

        st.markdown("""
        <div class="how-to-use">
        <ol>
        <li>Upload your documents using the form below</li>
        <li>Process the documents</li>
        <li>Ask questions about your documents</li>
        <li>View the AI's answers and thinking process</li>
        </ol>
        </div>
        """, unsafe_allow_html=True)

        st.markdown('<div class="sidebar-section-header">📄 Upload Documents</div>', unsafe_allow_html=True)
        st.markdown('<div class="sidebar-uploader-label">Select files to upload:</div>', unsafe_allow_html=True)

        # "md" is accepted here because process_uploaded_files already
        # handles .md files as plain text.
        uploaded_files = st.file_uploader(
            "Upload documents",
            type=["pdf", "txt", "md", "png", "jpg", "jpeg"],
            accept_multiple_files=True,
            label_visibility="collapsed",
        )

        if uploaded_files and st.button("Process Documents"):
            with st.spinner("Processing documents..."):
                # The bar now advances at real milestones instead of playing
                # a one-second animation before any work starts.
                progress_bar = st.progress(0)
                text, file_info = process_uploaded_files(uploaded_files)
                progress_bar.progress(50)
                st.session_state.vectorstore = create_vectorstore(text)
                progress_bar.progress(100)
                st.session_state.documents_processed = True
                st.session_state.file_info = file_info

            st.success(f"✅ Processed {len(uploaded_files)} documents successfully!")

        if "file_info" in st.session_state and st.session_state.file_info:
            st.markdown('<div class="divider"></div>', unsafe_allow_html=True)
            st.markdown('<div class="sidebar-section-header">📋 Document Information</div>', unsafe_allow_html=True)

            for file in st.session_state.file_info:
                # File names come from the user; escape them before they are
                # placed inside raw HTML.
                safe_name = html.escape(file['name'])
                safe_type = html.escape(file['type'])
                st.markdown(
                    '<div class="file-item">'
                    f'<div class="file-name">{safe_name}</div>'
                    f'<div class="file-type">{safe_type} file</div>'
                    '</div>',
                    unsafe_allow_html=True,
                )

            if st.button("Remove All Documents"):
                for key in ("vectorstore", "file_info", "documents_processed"):
                    if key in st.session_state:
                        del st.session_state[key]
                st.success("All documents removed!")
                st.rerun()

        st.markdown('<div class="divider"></div>', unsafe_allow_html=True)
        st.markdown('<div class="sidebar-section-header">⚙️ Controls</div>', unsafe_allow_html=True)

        if st.button("Clear Chat"):
            st.session_state.messages = []
            st.session_state.thinking_history = []
            st.rerun()

        if st.button("Reset All"):
            clear_session_state()
            st.rerun()

        if st.button("Hide Examples" if st.session_state.show_examples else "Show Examples"):
            st.session_state.show_examples = not st.session_state.show_examples
            st.rerun()


def _render_chat_history():
    """Draw every stored message, plus a reasoning toggle under each answer."""
    import html  # function-scope import; only used to escape message text

    if not st.session_state.messages:
        st.info("Upload documents and start asking questions!")
        return

    for i, message in enumerate(st.session_state.messages):
        # Message text is user- or model-supplied and is rendered with
        # unsafe_allow_html=True, so it must be escaped first. The markup is
        # built on a single line so it does not depend on how indentation
        # inside a triple-quoted f-string is treated.
        safe_content = html.escape(message.content, quote=False)

        if isinstance(message, HumanMessage):
            st.markdown(
                f'<div class="user-message"><strong>User:</strong> {safe_content}</div>',
                unsafe_allow_html=True,
            )
            continue

        st.markdown(
            f'<div class="assistant-message"><strong>Assistant:</strong> {safe_content}</div>',
            unsafe_allow_html=True,
        )

        # Messages alternate user/assistant, so i // 2 is the index of this
        # question/answer pair in thinking_history.
        pair_index = i // 2
        history = st.session_state.thinking_history
        if pair_index >= len(history) or not history[pair_index]:
            continue  # no reasoning recorded for this answer

        thinking_key = f"thinking_{pair_index}"
        if thinking_key not in st.session_state:
            st.session_state[thinking_key] = False

        toggle_text = "Hide thinking" if st.session_state[thinking_key] else "Show thinking"
        if st.button(toggle_text, key=f"toggle_{thinking_key}"):
            st.session_state[thinking_key] = not st.session_state[thinking_key]
            st.rerun()

        if st.session_state[thinking_key]:
            with st.expander("Thinking Process", expanded=True):
                st.write(history[pair_index])


def main():
    """Render the Document Q&A page and handle one round of user interaction.

    Session state keys used: ``show_examples`` (bool), ``messages`` (chat
    history), ``thinking_history`` (one reasoning string per answered
    question), plus ``vectorstore``, ``file_info`` and ``documents_processed``
    once documents have been processed.
    """
    # Seed the keys the rest of the page reads unconditionally.
    if "show_examples" not in st.session_state:
        st.session_state.show_examples = True
    if "messages" not in st.session_state:
        st.session_state.messages = []
    if "thinking_history" not in st.session_state:
        st.session_state.thinking_history = []

    _render_sidebar()

    st.title("Document Q&A Assistant")

    if st.session_state.show_examples:
        st.markdown("### Example Questions")
        cols = st.columns(len(EXAMPLE_QUESTIONS))
        for i, question in enumerate(EXAMPLE_QUESTIONS):
            with cols[i]:
                # Index-based keys are stable across processes, unlike hash().
                if st.button(question, key=f"example_{i}"):
                    _ask(question)
                    st.rerun()

    st.markdown("### 💬 Chat")
    chat_container = st.container()
    with chat_container:
        _render_chat_history()

    st.markdown("### Ask a question about your documents")
    with st.form(key="chat_form", clear_on_submit=True):
        user_input = st.text_input("Type your question here...", key="user_question", label_visibility="collapsed")
        cols = st.columns([6, 1])
        with cols[0]:
            submit_button = st.form_submit_button("Ask", use_container_width=True)

    if submit_button and user_input:
        _ask(user_input)
        # Rerun so the new exchange is drawn by the history renderer above.
        st.rerun()


if __name__ == "__main__":
    main()
|
|