langchain-chat-with-pdf-openai

Sleeping

File size: 6,008 Bytes

d8c3a88
d2e3c7f
4277202
d2e3c7f
 
 
 
6c5c0ad
6a6fbcd
d2e3c7f
d43bb1b
4277202
 
 
 
 
 
633ac28
6a6fbcd
 
1e82c8e
6a6fbcd
 
 
 
 
 
 
 
 
cd5ae07
579fd0d
6a6fbcd
 
 
 
 
 
 
 
889981c
6a6fbcd
 
 
 
 
 
 
 
4277202
6a6fbcd
 
d2e3c7f
 
 
1e82c8e
d2e3c7f
1e82c8e
355b657
d2e3c7f
6a6fbcd
3f31c68
ccff99d
3f31c68
ccff99d
579fd0d
6a6fbcd
 
 
 
 
3aa1446
 
 
 
7e2233e
 
 
579fd0d
ccff99d
 
5e8e8f0
f8d8d78
d2e3c7f
 
 
 
f74eb2e
873a6e6
ccff99d
 
6a6fbcd
b840efb
ccff99d
b840efb
a261843
f8d8d78
d2e3c7f
ccff99d
d2e3c7f
ccff99d
6a6fbcd
 
 
 
d2e3c7f
 
6a6fbcd
 
 
 
 
 
 
 
 
 
ccff99d
 
ff0e62c
6a6fbcd
ccff99d
d2e3c7f
 
 
 
8af0aff
 
 
 
 
4277202
8af0aff
d2e3c7f
 
8af0aff
 
 
 
 
 
 
4277202
8af0aff
d2e3c7f
f74eb2e
ccff99d
f74eb2e
 
6a6fbcd
d2e3c7f
6a6fbcd
f74eb2e
d2e3c7f
 
 
5e8e8f0
d2e3c7f
 
6a6fbcd
d2e3c7f
6a6fbcd
d2e3c7f
6a6fbcd
 
 
5e8e8f0
d2e3c7f
91326a4

import os
import gradio as gr
import logging
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
import concurrent.futures
import timeout_decorator

# Logging setup.
# basicConfig sets the root logger's threshold to INFO and, when the root
# logger has no handlers yet, attaches the default stream handler.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the classes and Gradio callbacks in this file.
logger = logging.getLogger(name=__name__)

class QueryRefiner:
    """Rewrite a user's question with an LLM before it is used for retrieval.

    The instance owns three pieces: a ``gpt-4o`` chat model at temperature
    0.2, a prompt template with the variables ``query`` and ``context``, and
    an ``LLMChain`` that combines the two.
    """

    def __init__(self):
        """Create the refinement model, its prompt and the chain joining them."""
        prompt_text = """Refine and enhance the following query for maximum clarity and precision:

Original Query: {query}
Document Context: {context}

Enhanced Query Requirements:
- Restructure for optimal comprehension
- rewrite the original query for best comprehension for getting all the details in great attention to details
- Use specific structure and the response be according to context such as paragraphs or bullet points, headlines and subtexts

Refined Query:"""
        self.refinement_llm = ChatOpenAI(temperature=0.2, model_name='gpt-4o')
        self.refinement_prompt = PromptTemplate(
            template=prompt_text,
            input_variables=['query', 'context'],
        )
        self.refinement_chain = LLMChain(
            prompt=self.refinement_prompt,
            llm=self.refinement_llm,
        )

    def refine_query(self, original_query, context_hints=''):
        """Return an LLM-refined version of ``original_query``.

        Args:
            original_query: The question as typed by the user.
            context_hints: Optional description of the document; a falsy value
                is replaced by ``"General academic document"``.

        Returns:
            The chain's output with surrounding whitespace removed, or
            ``original_query`` unchanged if anything in the refinement step
            raises (the failure is logged, never propagated).
        """
        try:
            # Everything stays inside the try so any failure falls back to the
            # untouched query.
            chain_inputs = {
                'context': context_hints or "General academic document",
                'query': original_query,
            }
            raw_output = self.refinement_chain.run(chain_inputs)
            return raw_output.strip()
        except Exception as e:
            logger.error(f"Query refinement error: {e}")
            return original_query

class AdvancedPdfChatbot:
    """Conversational question answering over one PDF at a time.

    A PDF is loaded, split into overlapping chunks, embedded and stored in a
    FAISS index. Questions are first rewritten by ``QueryRefiner`` and then
    answered by a ``ConversationalRetrievalChain`` that retrieves the three
    most similar chunks and keeps the dialogue in a buffer memory.
    """

    def __init__(self, openai_api_key):
        """Create the models, memory, query refiner and answer prompt.

        Args:
            openai_api_key: OpenAI API key. When provided it is exported as the
                ``OPENAI_API_KEY`` environment variable. ``None`` or an empty
                string leaves the environment untouched.
        """
        # os.environ only accepts str values; assigning None raises TypeError.
        # Skipping the assignment lets the OpenAI client setup below report a
        # missing key itself (assumed behaviour of the LangChain wrappers).
        if openai_api_key:
            os.environ["OPENAI_API_KEY"] = openai_api_key
        self.embeddings = OpenAIEmbeddings()
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        self.llm = ChatOpenAI(temperature=0, model_name='gpt-4o')

        self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
        self.query_refiner = QueryRefiner()
        # Both stay None until a PDF has been processed successfully.
        self.db = None
        self.chain = None

        self.qa_prompt = PromptTemplate(
            template="""You are an expert academic assistant analyzing a document. Provide well structured response in Markdown

Context: {context}
Question: {question}

Provide a comprehensive, precise answer based strictly on the document's content.
Use this format: 
- Short summary of the response with a relevant title
- Headlines and bullet points with descriptions with breakdowns of each topics and details
- Conclusion

NOTE: Give precise and short answers when asked about specific terms and summary of specific topic

If the answer isn't directly available, explain why. """,
            input_variables=["context", "question"]
        )

    def load_and_process_pdf(self, pdf_path):
        """Index the PDF at ``pdf_path`` and rebuild the retrieval chain for it.

        Args:
            pdf_path: Filesystem path of the PDF to load.

        Raises:
            ValueError: If the PDF yields no text chunks (for example a
                scanned document without a text layer).
            Exception: Whatever the loader, embedding or index construction
                raises; the previous ``db``/``chain`` are kept in that case.
        """
        pages = PyPDFLoader(pdf_path).load()
        chunks = self.text_splitter.split_documents(pages)
        if not chunks:
            # Fail with a readable message instead of letting index
            # construction fail on an empty chunk list.
            raise ValueError("No extractable text found in the PDF.")

        new_db = FAISS.from_documents(chunks, self.embeddings)
        new_chain = ConversationalRetrievalChain.from_llm(
            llm=self.llm,
            retriever=new_db.as_retriever(search_kwargs={"k": 3}),
            memory=self.memory,
            combine_docs_chain_kwargs={"prompt": self.qa_prompt}
        )

        # Publish both only after everything succeeded so a failed upload
        # never leaves a new index paired with the old chain.
        self.db = new_db
        self.chain = new_chain
        # History about the previous document would otherwise be mixed into
        # follow-up questions about the new one.
        self.memory.clear()

    def chat(self, query) -> str:
        """Answer ``query`` from the currently loaded PDF.

        Args:
            query: The user's question.

        Returns:
            The chain's ``answer`` text, or a prompt to upload a PDF when no
            document has been processed yet.
        """
        if self.chain is None:
            return "Please upload a PDF first."

        context_hints = self._extract_document_type()
        refined_query = self.query_refiner.refine_query(query, context_hints)

        result = self.chain({"question": refined_query})
        return result['answer']

    def _extract_document_type(self) -> str:
        """Return a short hint describing the loaded document.

        The hint quotes the first 100 characters of the first stored chunk.
        Returns ``""`` when no document is loaded and a generic label when the
        store cannot be inspected.
        """
        if self.db is None:
            return ""
        try:
            # Relies on the docstore's private ``_dict`` mapping; take only the
            # first entry instead of copying every stored chunk into a list.
            first_chunk = next(iter(self.db.docstore._dict.values()))
            preview = first_chunk.page_content[:100]
        except Exception:
            # Best-effort hint: an empty or unexpected store is not an error,
            # but KeyboardInterrupt/SystemExit must still propagate.
            return "Academic/technical document"
        return f"Document appears to cover: {preview}..."

    def clear_memory(self):
        """Erase the stored conversation history."""
        self.memory.clear()

# Gradio Interface
# One chatbot instance shared by every callback below; the API key is read
# from the OPENAI_API_KEY environment variable (None when it is not set).
pdf_chatbot = AdvancedPdfChatbot(openai_api_key=os.getenv("OPENAI_API_KEY"))

def upload_pdf(pdf_file):
    """Gradio callback: index the uploaded PDF and report the outcome.

    Args:
        pdf_file: Value of the file component; either an object exposing a
            ``name`` attribute (used as the path) or the path itself, or
            ``None`` when nothing was uploaded.

    Returns:
        A status message: a request to upload a file, a success note that
        includes the path, or the error text if processing failed.
    """
    if pdf_file is None:
        return "Please upload a PDF file."

    # Use the object's ``name`` when it has one, otherwise the value itself.
    file_path = getattr(pdf_file, 'name', pdf_file)

    try:
        pdf_chatbot.load_and_process_pdf(file_path)
    except Exception as e:
        logger.error(f"PDF processing error: {e}")
        return f"Error processing PDF: {str(e)}"
    return f"PDF processed successfully: {file_path}"

def respond(message, history):
    """Gradio callback: answer ``message`` and append the turn to the chat.

    Args:
        message: Text submitted from the input box; falsy values are ignored.
        history: Current chat history as a list of ``(user, bot)`` pairs.
            ``None`` is treated as an empty history.

    Returns:
        A ``(textbox_value, history)`` pair. The textbox value is always
        ``""`` so the input is cleared. On failure the error text is shown as
        the bot's reply in the chat rather than being written into the input
        box, where it would replace the user's text and could be resubmitted.
    """
    # NOTE(review): whether Gradio ever passes None for an empty Chatbot
    # depends on the Gradio version; normalising makes append() always safe.
    if history is None:
        history = []
    if not message:
        return "", history

    try:
        bot_message = pdf_chatbot.chat(message)
    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception("Chat response error: %s", e)
        bot_message = f"Error: {str(e)}"

    history.append((message, bot_message))
    return "", history

def clear_chatbot():
    """Gradio callback: wipe the chatbot's memory and return an empty history."""
    pdf_chatbot.clear_memory()
    emptied_history = []
    return emptied_history

# Gradio UI
# Components are declared in the order they should appear; each event handler
# is registered right after the components it connects.
with gr.Blocks() as demo:
    gr.Markdown("# Advanced PDF Chatbot")
    
    # Upload row: file picker (restricted to .pdf) next to its trigger button.
    with gr.Row():
        pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
        upload_button = gr.Button("Process PDF")

    # upload_pdf's return string is displayed in this status box.
    upload_status = gr.Textbox(label="Upload Status")
    upload_button.click(upload_pdf, inputs=[pdf_upload], outputs=[upload_status])
    
    # Chat area: submitting the textbox calls respond; its first return value
    # is written back to the textbox and its second to the chat display.
    chatbot_interface = gr.Chatbot()
    msg = gr.Textbox(placeholder="Enter your query...")
    msg.submit(respond, inputs=[msg, chatbot_interface], outputs=[msg, chatbot_interface])
    
    # clear_chatbot takes no inputs; its return value replaces the chat display.
    clear_button = gr.Button("Clear Conversation")
    clear_button.click(clear_chatbot, outputs=[chatbot_interface])

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()