# app.py — Advanced PDF Chatbot (Hugging Face Space; commit 45bb9f0)
import os
import gradio as gr
import logging
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
import concurrent.futures
import timeout_decorator
# Configure logging
# Module-level logger used by QueryRefiner and the Gradio callbacks below;
# basicConfig at INFO so refinement/PDF/chat errors show up in the Space logs.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class QueryRefiner:
    """Rewrites a raw user question into a sharper retrieval query via an LLM."""

    def __init__(self):
        # Low temperature keeps the rewrite close to the user's original intent.
        self.refinement_llm = ChatOpenAI(temperature=0.2, model_name='gpt-3.5-turbo', request_timeout=30)
        self.refinement_prompt = PromptTemplate(
            input_variables=['query', 'context'],
            template="""Refine and enhance the following query for maximum clarity and precision:
Original Query: {query}
Document Context: {context}
Enhanced Query Requirements:
- Clarify any ambiguous terms
- Add specific context-driven details
- Ensure precise information retrieval
- Restructure for optimal comprehension
Refined Query:"""
        )
        self.refinement_chain = LLMChain(
            llm=self.refinement_llm,
            prompt=self.refinement_prompt
        )

    def refine_query(self, original_query, context_hints=''):
        """Return an LLM-refined version of *original_query*.

        Falls back to the untouched query if the refinement call fails for
        any reason, so a refinement outage never blocks the chat flow.
        """
        payload = {
            'query': original_query,
            'context': context_hints or "General academic document",
        }
        try:
            return self.refinement_chain.run(payload).strip()
        except Exception as e:
            logger.error(f"Query refinement error: {e}")
            return original_query
class AdvancedPdfChatbot:
    """Conversational QA over a single PDF, backed by a FAISS vector store.

    Workflow: load_and_process_pdf() builds the index and retrieval chain;
    chat() then answers questions, first refining them with QueryRefiner.
    """

    def __init__(self, openai_api_key):
        """Set up embeddings, splitter, LLM, memory and the query refiner.

        Args:
            openai_api_key: OpenAI key, exported to the environment so the
                langchain clients pick it up. May be None (e.g. the env var
                is unset at module import), in which case the environment is
                left untouched instead of raising TypeError.
        """
        # Fix: os.environ[...] = None raises TypeError; only export a real key.
        if openai_api_key:
            os.environ["OPENAI_API_KEY"] = openai_api_key
        self.embeddings = OpenAIEmbeddings(request_timeout=30)
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        self.llm = ChatOpenAI(temperature=0, model_name='gpt-4', request_timeout=30)
        self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
        self.query_refiner = QueryRefiner()
        self.db = None     # FAISS index; set by load_and_process_pdf()
        self.chain = None  # ConversationalRetrievalChain; set alongside db
        self.qa_prompt = PromptTemplate(
            template="""You are an expert academic assistant analyzing a document.
Context: {context}
Question: {question}
Provide a comprehensive, precise answer based strictly on the document's content.
If the answer isn't directly available, explain why. Try to structure your response according to context such as paragraphs or bullet points, headlines and subtexts""",
            input_variables=["context", "question"]
        )

    def load_and_process_pdf(self, pdf_path):
        """Load *pdf_path*, split it, embed it into FAISS and build the chain."""
        loader = PyPDFLoader(pdf_path)
        documents = loader.load()
        texts = self.text_splitter.split_documents(documents)
        self.db = FAISS.from_documents(texts, self.embeddings)
        # k=3: retrieve the three most similar chunks per question.
        self.chain = ConversationalRetrievalChain.from_llm(
            llm=self.llm,
            retriever=self.db.as_retriever(search_kwargs={"k": 3}),
            memory=self.memory,
            combine_docs_chain_kwargs={"prompt": self.qa_prompt}
        )

    def chat(self, query):
        """Answer *query* against the loaded PDF; prompt for upload if none."""
        if not self.chain:
            return "Please upload a PDF first."
        context_hints = self._extract_document_type()
        refined_query = self.query_refiner.refine_query(query, context_hints)
        result = self.chain({"question": refined_query})
        return result['answer']

    def _extract_document_type(self):
        """Extract basic document characteristics"""
        if not self.db:
            return ""
        try:
            # Peek at the first stored chunk to give the refiner a topic hint.
            first_doc = list(self.db.docstore._dict.values())[0].page_content[:500]
            return f"Document appears to cover: {first_doc[:100]}..."
        except Exception:
            # Fix: was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt. The generic fallback hint is kept on purpose.
            return "Academic/technical document"

    def clear_memory(self):
        """Drop the accumulated conversation history."""
        self.memory.clear()
# Gradio Interface
# Single module-level chatbot instance shared by every UI callback below.
# NOTE(review): os.environ.get may return None here when the env var is unset;
# verify AdvancedPdfChatbot tolerates a missing key before deploying.
pdf_chatbot = AdvancedPdfChatbot(os.environ.get("OPENAI_API_KEY"))
def upload_pdf(pdf_file):
    """Gradio callback: ingest the uploaded PDF and report a status string."""
    if pdf_file is None:
        return "Please upload a PDF file."
    # Gradio may hand us a tempfile-like object (with .name) or a plain path.
    file_path = getattr(pdf_file, 'name', pdf_file)
    try:
        pdf_chatbot.load_and_process_pdf(file_path)
    except Exception as e:
        logger.error(f"PDF processing error: {e}")
        return f"Error processing PDF: {str(e)}"
    return f"PDF processed successfully: {file_path}"
def respond(message, history):
    """Gradio callback: answer *message*, append the turn, clear the textbox."""
    if not message:
        # Empty/None input: nothing to do, keep the history untouched.
        return "", history
    try:
        history.append((message, pdf_chatbot.chat(message)))
        return "", history
    except Exception as e:
        logger.error(f"Chat response error: {e}")
        return f"Error: {str(e)}", history
def clear_chatbot():
    """Gradio callback: wipe the LLM conversation memory and empty the chat window."""
    pdf_chatbot.clear_memory()
    return []
# Gradio UI
# Layout: an upload row (file picker + process button + status box) above the
# chat window, its input textbox, and a clear button.
with gr.Blocks() as demo:
    gr.Markdown("# Advanced PDF Chatbot")
    with gr.Row():
        pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
        upload_button = gr.Button("Process PDF")
        upload_status = gr.Textbox(label="Upload Status")
    # Clicking "Process PDF" ingests the file; the status box shows success/error.
    upload_button.click(upload_pdf, inputs=[pdf_upload], outputs=[upload_status])
    chatbot_interface = gr.Chatbot()
    msg = gr.Textbox(placeholder="Enter your query...")
    # Submitting the textbox sends the question and clears the input field.
    msg.submit(respond, inputs=[msg, chatbot_interface], outputs=[msg, chatbot_interface])
    clear_button = gr.Button("Clear Conversation")
    # Clearing resets both the visible chat window and the LLM memory.
    clear_button.click(clear_chatbot, outputs=[chatbot_interface])

if __name__ == "__main__":
    demo.launch()