"""Gradio chatbot that answers questions about an uploaded PDF.

Pipeline: PDF -> chunked text -> FAISS vector store -> conversational
retrieval chain (GPT-4), with a cheaper GPT-3.5 pre-step that refines
the user's query before retrieval.
"""

import logging
import os

import gradio as gr
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class QueryRefiner:
    """Rewrites a raw user query into a clearer, more precise retrieval query."""

    def __init__(self):
        # A cheap, fast model is sufficient for query rewriting; low
        # temperature keeps the rewrite close to the user's intent.
        self.refinement_llm = ChatOpenAI(
            temperature=0.2, model_name='gpt-3.5-turbo', request_timeout=30
        )
        self.refinement_prompt = PromptTemplate(
            input_variables=['query', 'context'],
            template="""Refine and enhance the following query for maximum clarity and precision:

Original Query: {query}
Document Context: {context}

Enhanced Query Requirements:
- Clarify any ambiguous terms
- Add specific context-driven details
- Ensure precise information retrieval
- Restructure for optimal comprehension

Refined Query:""",
        )
        self.refinement_chain = LLMChain(
            llm=self.refinement_llm,
            prompt=self.refinement_prompt,
        )

    def refine_query(self, original_query, context_hints=''):
        """Return a refined version of *original_query*.

        Falls back to the original query on any LLM/network failure so a
        refinement outage never blocks the chat itself.
        """
        try:
            refined_query = self.refinement_chain.run({
                'query': original_query,
                'context': context_hints or "General academic document",
            })
            return refined_query.strip()
        except Exception as e:
            logger.error("Query refinement error: %s", e)
            return original_query


class AdvancedPdfChatbot:
    """Indexes a PDF into FAISS and answers questions about its content."""

    def __init__(self, openai_api_key):
        # Only export the key when one was actually supplied: assigning
        # None into os.environ raises TypeError, which previously crashed
        # module import when OPENAI_API_KEY was unset.
        if openai_api_key:
            os.environ["OPENAI_API_KEY"] = openai_api_key
        self.embeddings = OpenAIEmbeddings(request_timeout=30)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200
        )
        self.llm = ChatOpenAI(temperature=0, model_name='gpt-4', request_timeout=30)
        self.memory = ConversationBufferMemory(
            memory_key="chat_history", return_messages=True
        )
        self.query_refiner = QueryRefiner()
        self.db = None     # FAISS store; populated by load_and_process_pdf()
        self.chain = None  # retrieval chain; populated by load_and_process_pdf()
        self.qa_prompt = PromptTemplate(
            template="""You are an expert academic assistant analyzing a document.

Context: {context}
Question: {question}

Provide a comprehensive, precise answer based strictly on the document's content. 
If the answer isn't directly available, explain why. 
Try to structure your response according to context such as paragraphs or bullet points, headlines and subtexts""",
            input_variables=["context", "question"],
        )

    def load_and_process_pdf(self, pdf_path):
        """Load *pdf_path*, chunk it, build the FAISS index and the QA chain."""
        loader = PyPDFLoader(pdf_path)
        documents = loader.load()
        texts = self.text_splitter.split_documents(documents)
        self.db = FAISS.from_documents(texts, self.embeddings)
        self.chain = ConversationalRetrievalChain.from_llm(
            llm=self.llm,
            retriever=self.db.as_retriever(search_kwargs={"k": 3}),
            memory=self.memory,
            combine_docs_chain_kwargs={"prompt": self.qa_prompt},
        )

    def chat(self, query):
        """Answer *query* against the loaded PDF; requires a prior upload."""
        if not self.chain:
            return "Please upload a PDF first."
        context_hints = self._extract_document_type()
        refined_query = self.query_refiner.refine_query(query, context_hints)
        result = self.chain({"question": refined_query})
        return result['answer']

    def _extract_document_type(self):
        """Extract basic document characteristics as a context hint string."""
        if not self.db:
            return ""
        try:
            # Peek at the first stored chunk to hint at the document's topic.
            # NOTE(review): relies on the private FAISS docstore API (_dict);
            # may break across langchain versions — hence the broad fallback.
            first_doc = list(self.db.docstore._dict.values())[0].page_content[:500]
            return f"Document appears to cover: {first_doc[:100]}..."
        except Exception:
            return "Academic/technical document"

    def clear_memory(self):
        """Forget the conversation history (the FAISS index is kept)."""
        self.memory.clear()


# Gradio Interface
pdf_chatbot = AdvancedPdfChatbot(os.environ.get("OPENAI_API_KEY"))


def upload_pdf(pdf_file):
    """Gradio callback: index the uploaded file and report status."""
    if pdf_file is None:
        return "Please upload a PDF file."
    file_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file
    try:
        pdf_chatbot.load_and_process_pdf(file_path)
        return f"PDF processed successfully: {file_path}"
    except Exception as e:
        logger.exception("PDF processing error")
        return f"Error processing PDF: {str(e)}"


def respond(message, history):
    """Gradio callback: answer *message* and append the turn to *history*."""
    if not message:
        return "", history
    try:
        bot_message = pdf_chatbot.chat(message)
        history.append((message, bot_message))
        return "", history
    except Exception as e:
        logger.exception("Chat response error")
        return f"Error: {str(e)}", history


def clear_chatbot():
    """Gradio callback: wipe conversation memory and clear the chat widget."""
    pdf_chatbot.clear_memory()
    return []


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# Advanced PDF Chatbot")
    with gr.Row():
        pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
        upload_button = gr.Button("Process PDF")
    upload_status = gr.Textbox(label="Upload Status")
    upload_button.click(upload_pdf, inputs=[pdf_upload], outputs=[upload_status])

    chatbot_interface = gr.Chatbot()
    msg = gr.Textbox(placeholder="Enter your query...")
    msg.submit(respond, inputs=[msg, chatbot_interface],
               outputs=[msg, chatbot_interface])

    clear_button = gr.Button("Clear Conversation")
    clear_button.click(clear_chatbot, outputs=[chatbot_interface])

if __name__ == "__main__":
    demo.launch()