import os

# Install system dependencies for PDF rendering (poppler) and OCR (tesseract);
# needed on hosts such as Streamlit Community Cloud where they are not preinstalled.
os.system("apt-get update && apt-get install -y poppler-utils")
os.system("apt-get update && apt-get install -y tesseract-ocr")

import tempfile
import time
from typing import List, Dict, Any, TypedDict, Optional

import streamlit as st
from PIL import Image
import pytesseract
# pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
from pdf2image import convert_from_path
import pypdf
from dotenv import load_dotenv

from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser
from langchain_together import Together
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langgraph.graph import StateGraph, END

# Load environment variables
load_dotenv()

# Set page configuration
st.set_page_config(
    page_title="Document Q&A",
    page_icon="📚",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS for better UI
st.markdown(""" """, unsafe_allow_html=True)

# Example questions
EXAMPLE_QUESTIONS = [
    "How do the different topics in these documents relate to each other?",
    "What is the structure of this document?",
    "Can you analyze the writing style of this text?",
    "Extract all dates and events mentioned in the document",
    "What are the main arguments presented in this document?"
]

# Initialize the LLM
@st.cache_resource
def get_llm():
    return Together(
        model="deepseek-ai/DeepSeek-V3",
        temperature=0.7,
        max_tokens=1024
    )

# Initialize embeddings
@st.cache_resource
def get_embeddings():
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Initialize text splitter
@st.cache_resource
def get_text_splitter():
    return RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200
    )

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    pdf_reader = pypdf.PdfReader(pdf_file)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text() or ""
    return text

# Function to extract text from image using OCR
def extract_text_from_image(image_file):
    image = Image.open(image_file)
    text = pytesseract.image_to_string(image)
    return text

# Function to process PDF with OCR if needed
def process_pdf_with_ocr(pdf_file):
    # First try normal text extraction
    text = extract_text_from_pdf(pdf_file)

    # If little or no text was extracted, try OCR
    if len(text.strip()) < 100:
        images = convert_from_path(pdf_file)
        text = ""
        for image in images:
            text += pytesseract.image_to_string(image)

    return text

# Function to process uploaded files
def process_uploaded_files(uploaded_files):
    all_texts = []
    file_info = []

    for file in uploaded_files:
        # Create a temporary file
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            temp_file.write(file.getvalue())
            temp_file_path = temp_file.name

        # Process based on file type
        if file.name.lower().endswith('.pdf'):
            text = process_pdf_with_ocr(temp_file_path)
            file_type = "PDF"
        elif file.name.lower().endswith(('.png', '.jpg', '.jpeg')):
            text = extract_text_from_image(temp_file_path)
            file_type = "Image"
        elif file.name.lower().endswith(('.txt', '.md')):
            text = file.getvalue().decode('utf-8')
            file_type = "Text"
        else:
            text = f"Unsupported file format: {file.name}"
            file_type = "Unknown"

        all_texts.append(f"--- Content from {file.name} ---\n{text}")
        file_info.append({"name": file.name, "type": file_type})

        # Clean up the temporary file
        os.unlink(temp_file_path)

    return "\n\n".join(all_texts), file_info
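
# A minimal sketch (not called by the app) of how the extraction helpers above
# behave; "sample_scan.pdf" is a hypothetical path used only for illustration.
def _demo_text_extraction():
    # Falls back to page-by-page OCR when the embedded text layer
    # yields fewer than 100 characters.
    text = process_pdf_with_ocr("sample_scan.pdf")
    print(f"Extracted {len(text)} characters")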
# Function to create vector store from text
def create_vectorstore(text):
    text_splitter = get_text_splitter()
    chunks = text_splitter.split_text(text)

    # Use FAISS instead of Chroma to avoid SQLite dependency
    return FAISS.from_texts(
        texts=chunks,
        embedding=get_embeddings()
    )

# Define the state schema for the graph using TypedDict
class GraphState(TypedDict):
    messages: List
    documents: List
    thinking: str

# Define the RAG agent using LangGraph
def create_rag_agent(vectorstore):
    # Define the retrieval component
    def retrieve(state: GraphState) -> GraphState:
        query = state["messages"][-1].content
        docs = vectorstore.similarity_search(query, k=5)
        return {
            "documents": docs,
            "messages": state["messages"],
            "thinking": state.get("thinking", "")
        }

    # Define the generation component with thinking step
    def generate(state: GraphState) -> GraphState:
        messages = state["messages"]
        documents = state["documents"]

        # Extract relevant context from documents
        context = "\n\n".join(
            f"Document {i+1}:\n{doc.page_content}" for i, doc in enumerate(documents)
        )

        # First, have the model think about the query
        thinking_prompt = ChatPromptTemplate.from_messages([
            SystemMessage(content="You are an assistant that thinks step by step before answering."),
            MessagesPlaceholder(variable_name="messages"),
            SystemMessage(content=f"Here is relevant context from the knowledge base:\n{context}\n\nThink step by step about how to answer the query using this context.")
        ])
        thinking = thinking_prompt | get_llm() | StrOutputParser()
        thinking_result = thinking.invoke({"messages": messages})

        # Then generate the final answer
        answer_prompt = ChatPromptTemplate.from_messages([
            SystemMessage(content="You are a helpful assistant that provides accurate information based on the given context."),
            MessagesPlaceholder(variable_name="messages"),
            SystemMessage(content=f"Here is relevant context from the knowledge base:\n{context}\n\nHere is your thinking process:\n{thinking_result}\n\nNow provide a clear and helpful answer based on this context and thinking.")
        ])
        answer = answer_prompt | get_llm() | StrOutputParser()
        response = answer.invoke({"messages": messages})

        return {
            "messages": messages + [AIMessage(content=response)],
            "thinking": thinking_result,
            "documents": documents
        }

    # Create the graph: retrieve -> generate -> END
    workflow = StateGraph(GraphState)
    workflow.add_node("retrieve", retrieve)
    workflow.add_node("generate", generate)
    workflow.set_entry_point("retrieve")
    workflow.add_edge("retrieve", "generate")
    workflow.add_edge("generate", END)

    # Compile the graph
    app = workflow.compile()
    return app

# Function to clear all session state
def clear_session_state():
    for key in list(st.session_state.keys()):
        del st.session_state[key]
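
# A minimal sketch (not called by the app) of driving the compiled agent
# directly, e.g. for testing outside the Streamlit UI; the document text and
# question are hypothetical placeholders, and the state keys mirror GraphState.
def _demo_agent_invocation():
    vectorstore = create_vectorstore("Example document text...")
    agent = create_rag_agent(vectorstore)
    result = agent.invoke({
        "messages": [HumanMessage(content="What is this document about?")],
        "documents": [],
        "thinking": ""
    })
    print(result["thinking"])
    print(result["messages"][-1].content)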
# Main app layout
def main():
    # Initialize session state for showing examples
    if "show_examples" not in st.session_state:
        st.session_state.show_examples = True

    # Initialize messages if not exists
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Initialize thinking history if not exists
    if "thinking_history" not in st.session_state:
        st.session_state.thinking_history = []

    # Sidebar for document upload and controls
    with st.sidebar:
        st.markdown('', unsafe_allow_html=True)
        st.markdown("""