File size: 5,079 Bytes
d8c3a88
d2e3c7f
4277202
d2e3c7f
 
 
 
6c5c0ad
6a6fbcd
d2e3c7f
d43bb1b
75fd4bb
4b219d0
58bf31d
75fd4bb
 
58bf31d
 
 
 
75fd4bb
 
58bf31d
75fd4bb
58bf31d
 
 
 
 
 
75fd4bb
58bf31d
75fd4bb
58bf31d
75fd4bb
58bf31d
75fd4bb
58bf31d
e8434ee
58bf31d
 
 
 
 
e8434ee
 
58bf31d
e8434ee
6a6fbcd
58bf31d
e8434ee
6a6fbcd
d2e3c7f
 
 
58bf31d
75fd4bb
1e82c8e
58bf31d
355b657
d2e3c7f
58bf31d
75fd4bb
3f31c68
58bf31d
5e8e8f0
d2e3c7f
7f36a98
75fd4bb
58bf31d
 
75fd4bb
 
58bf31d
7f36a98
 
 
 
58bf31d
 
7f36a98
75fd4bb
7f36a98
58bf31d
75fd4bb
f8d8d78
d2e3c7f
58bf31d
 
75fd4bb
58bf31d
 
75fd4bb
58bf31d
 
 
 
 
 
ccff99d
58bf31d
 
75fd4bb
58bf31d
d2e3c7f
58bf31d
75fd4bb
6a6fbcd
75fd4bb
 
 
 
 
7f36a98
75fd4bb
 
 
 
 
 
 
7f36a98
6a6fbcd
d2e3c7f
6a6fbcd
d2e3c7f
 
 
5e8e8f0
d2e3c7f
 
58bf31d
d2e3c7f
6a6fbcd
d2e3c7f
5e8e8f0
d2e3c7f
58bf31d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import logging
import os
import re

import gradio as gr
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from PyPDF2 import PdfReader

class ContextAwareResponseGenerator:
    """Generate answers via an LLM prompt that asks the model to first pick
    one of five response structures, then answer in that structure.

    The LLM output is expected to begin with the chosen structure number
    (e.g. "3. ..."); that leading marker is stripped before returning.
    """

    def __init__(self, llm):
        self.llm = llm
        # Prompt instructs the model to choose a structure (1-5) and answer.
        self.response_prompt = PromptTemplate(
            input_variables=['context', 'query', 'chat_history'],
            template="""Analyze the context, query, and chat history to generate an optimal response:

Context: {context}
Query: {query}
Chat History: {chat_history}

Response Structure Selection Criteria:
1. Technical academic breakdown
2. Concise summary with key points
3. Markdown with hierarchical insights
4. Narrative explanation
5. Comparative analysis

Choose the most appropriate response structure (1-5) and generate the response accordingly:"""
        )
        self.response_chain = LLMChain(llm=self.llm, prompt=self.response_prompt)

    def generate_response(self, context, query, chat_history=''):
        """Return the LLM's answer for *query* given *context* and history.

        Falls back to an apologetic message on any error (never raises).
        """
        try:
            full_response = self.response_chain.run({
                'context': context,
                'query': query,
                'chat_history': chat_history or "No previous context"
            })

            # BUG FIX: the original did full_response[1:], which unconditionally
            # dropped the first character of the answer whether or not a
            # structure marker was present. Strip a leading marker such as
            # "3", "3.", "3)" or "3:" only when it actually appears.
            marker = re.match(r'\s*[1-5][\.\):]?\s+', full_response)
            if marker:
                return full_response[marker.end():].strip()
            return full_response.strip()
        except Exception as e:
            logging.error(f"Response generation error: {e}")
            return f"I couldn't generate a response for: {query}"

class AdvancedPdfChatbot:
    """PDF question-answering chatbot backed by a FAISS vector store.

    Workflow: ``load_and_process_pdf`` indexes a PDF; ``chat`` retrieves the
    most relevant chunks and delegates answer generation to
    ``ContextAwareResponseGenerator``, storing each turn in conversation memory.
    """

    def __init__(self, openai_api_key):
        # BUG FIX: os.environ rejects non-string values, so assigning None
        # (key absent from the environment) raised TypeError at import time.
        if openai_api_key:
            os.environ["OPENAI_API_KEY"] = openai_api_key
        self.llm = ChatOpenAI(temperature=0.2, model_name='gpt-4o')

        self.embeddings = OpenAIEmbeddings()
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

        self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
        self.response_generator = ContextAwareResponseGenerator(self.llm)

        self.db = None  # FAISS index; stays None until a PDF is processed
        self.document_context = ""  # "Document: <title> by <author>" header

    def load_and_process_pdf(self, pdf_path):
        """Index the PDF at *pdf_path*; return True on success, False on failure."""
        try:
            reader = PdfReader(pdf_path)
            # BUG FIX: reader.metadata is None for PDFs without an info
            # dictionary, and individual entries may also be None.
            info = reader.metadata or {}
            metadata = {
                "title": info.get("/Title", "Untitled") or "Untitled",
                "author": info.get("/Author", "Unknown") or "Unknown",
            }

            loader = PyPDFLoader(pdf_path)
            documents = loader.load()
            texts = self.text_splitter.split_documents(documents)

            # NOTE(review): only the first 50 chunks are embedded — presumably
            # to bound embedding cost; very long documents are partially indexed.
            self.db = FAISS.from_documents(texts[:50], self.embeddings)
            self.document_context = f"Document: {metadata['title']} by {metadata['author']}"

            return True
        except Exception as e:
            logging.error(f"PDF processing error: {e}")
            return False

    def chat(self, query):
        """Answer *query* using retrieved PDF excerpts plus the chat history."""
        if not self.db:
            return "Please upload a PDF first."

        # Retrieve chat history
        chat_history = self.memory.load_memory_variables({}).get('chat_history', [])

        # BUG FIX: the original built a FAISS index but never queried it, so
        # answers were generated from the title/author line alone. Retrieve
        # the most relevant chunks and ground the response in them.
        docs = self.db.similarity_search(query, k=4)
        excerpts = "\n\n".join(doc.page_content for doc in docs)
        context = f"{self.document_context}\n\nRelevant excerpts:\n{excerpts}"

        # Generate context-aware response
        response = self.response_generator.generate_response(
            context=context,
            query=query,
            chat_history=str(chat_history)
        )

        # Store conversation in memory
        self.memory.save_context({"input": query}, {"output": response})

        return response

# Gradio Interface
# Module-level singleton shared by all Gradio callbacks. Reads the API key
# from the environment; NOTE(review): os.environ.get returns None when the
# variable is unset — confirm the key is always present at import time.
pdf_chatbot = AdvancedPdfChatbot(os.environ.get("OPENAI_API_KEY"))

def upload_pdf(pdf_file):
    """Index an uploaded PDF and return a human-readable status string."""
    if not pdf_file:
        return "Upload a PDF file."
    # Gradio may hand us a tempfile-like object (with .name) or a plain path.
    path = getattr(pdf_file, 'name', pdf_file)
    if pdf_chatbot.load_and_process_pdf(path):
        return "PDF processed successfully"
    return "Processing failed"

def respond(message, history):
    """Chat callback: append the (user, bot) turn to history, clear the box.

    On failure the error text is returned in place of the cleared textbox
    and the history is left untouched.
    """
    try:
        reply = pdf_chatbot.chat(message)
    except Exception as e:
        return f"Error: {e}", history
    history.append((message, reply))
    return "", history

# Gradio UI
# Layout: a file picker + button row, a status textbox, then the chat widget.
with gr.Blocks() as demo:
    gr.Markdown("# Advanced PDF Chatbot")
    with gr.Row():
        pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
        upload_button = gr.Button("Process PDF")

    upload_status = gr.Textbox(label="Upload Status")
    # Button click indexes the selected PDF and reports success/failure.
    upload_button.click(upload_pdf, inputs=[pdf_upload], outputs=[upload_status])
    
    # Pressing Enter in the textbox sends the query; the outputs clear the
    # textbox and refresh the chat transcript.
    chatbot_interface = gr.Chatbot()
    msg = gr.Textbox(placeholder="Enter your query...")
    msg.submit(respond, inputs=[msg, chatbot_interface], outputs=[msg, chatbot_interface])

if __name__ == "__main__":
    demo.launch()