langchain-chat-with-pdf-openai-MU

Paused

App Files Files Community

langchain-chat-with-pdf-openai-MU / app.py

Pavan178

Update app.py

d78dd14 verified 7 months ago

raw

history blame

8.49 kB

	import os
	import gradio as gr
	import logging
	from langchain.document_loaders import PyPDFLoader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.embeddings import OpenAIEmbeddings
	from langchain.vectorstores import FAISS
	from langchain.chat_models import ChatOpenAI
	from langchain.chains import ConversationalRetrievalChain, LLMChain
	from langchain.memory import ConversationBufferMemory
	from langchain.prompts import PromptTemplate

	from PyPDF2 import PdfReader # New import for PDF metadata extraction

	# Configure logging: install the root handler at INFO level so the
	# logger.info()/logger.error() calls in this module are emitted.
	logging.basicConfig(level=logging.INFO)
	# Module-level logger shared by the classes and Gradio callbacks below.
	logger = logging.getLogger(__name__)

	class QueryRefiner:
	    """Rewrite a user question into a clearer query with an LLM.

	    An ``LLMChain`` built from ``ChatOpenAI`` (``gpt-4o``, temperature 0.2)
	    and a fixed prompt receives the original question together with a short
	    description of the document and returns a reworded question.
	    """

	    def __init__(self):
	        self.refinement_llm = ChatOpenAI(temperature=0.2, model_name='gpt-4o')
	        self.refinement_prompt = PromptTemplate(
	            input_variables=['query', 'context'],
	            template="""Refine and enhance the following query for maximum clarity and precision:
	Original Query: {query}
	Document Context: {context}
	Enhanced Query Requirements:
	- Restructure for optimal comprehension
	- Rewrite the question to the best context and structure of output desired
	Refined Query:""",
	        )
	        self.refinement_chain = LLMChain(
	            llm=self.refinement_llm,
	            prompt=self.refinement_prompt,
	        )

	    def refine_query(self, original_query, context_hints=''):
	        """Return an LLM-refined version of ``original_query``.

	        Args:
	            original_query: The question as typed by the user.
	            context_hints: Text describing the document; when empty the
	                placeholder "General academic document" is sent instead.

	        Returns:
	            The refined query with surrounding whitespace removed. The
	            original query is returned unchanged when the chain raises or
	            when it produces an empty / whitespace-only answer.
	        """
	        try:
	            refined_query = self.refinement_chain.run({
	                'query': original_query,
	                'context': context_hints or "General academic document",
	            }).strip()
	        except Exception as e:
	            # Refinement is best effort: never let it block the actual chat.
	            logger.exception("Query refinement error: %s", e)
	            return original_query
	        # An empty refinement would make the caller ask an empty question;
	        # treat it like a failed refinement and keep the user's wording.
	        return refined_query or original_query

	class AdvancedPdfChatbot:
	    """Conversational question answering over a single uploaded PDF.

	    A PDF is loaded with ``PyPDFLoader``, split into 1000-character chunks
	    (200 overlap), embedded with ``OpenAIEmbeddings`` into a FAISS index and
	    queried through a ``ConversationalRetrievalChain`` (``gpt-4o``,
	    temperature 0, top 3 chunks). User questions are first rewritten by a
	    ``QueryRefiner`` using a short description of the document.
	    """

	    def __init__(self, openai_api_key):
	        """Create the chatbot.

	        Args:
	            openai_api_key: OpenAI API key. When given it is exported as the
	                ``OPENAI_API_KEY`` environment variable (presumably so the
	                OpenAI clients created below pick it up). May be ``None`` or
	                empty, in which case the environment is left untouched.
	        """
	        # os.environ only accepts strings: assigning None (what
	        # os.environ.get() yields for an unset variable) raises TypeError.
	        if openai_api_key:
	            os.environ["OPENAI_API_KEY"] = openai_api_key
	        self.embeddings = OpenAIEmbeddings()
	        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
	        self.llm = ChatOpenAI(temperature=0, model_name='gpt-4o')

	        self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
	        self.query_refiner = QueryRefiner()
	        self.db = None      # FAISS index of the current document
	        self.chain = None   # retrieval chain; None until a PDF is loaded
	        self.document_metadata = {}  # Store extracted document metadata

	        self.qa_prompt = PromptTemplate(
	            template="""You are an expert academic assistant analyzing a document. Provide well structured response in Markdown
	Context: {context}
	Question: {question}
	Provide a comprehensive, precise answer based strictly on the document's content.


	Use this different formats for different contexts:

	example format 1:
	- Short summary of the response with a relevant title
	- Headlines and bullet points with descriptions with breakdowns of each topic and details
	- Conclusion

	example format 2:

	Precise pragraph with headlines and a paragraph

	example format 3:
	Numbered bullet points or ordered lists

	Use more such formats to suit the user given context

	NOTE: Give precise and short answers when asked about specific terms and summaries of specific topics.
	If the answer isn't directly available, explain why. """,
	            input_variables=["context", "question"],
	        )

	    def load_and_process_pdf(self, pdf_path):
	        """Index the PDF at ``pdf_path`` and make it the active document.

	        The index and chain are built in local variables and only installed
	        on the instance once everything succeeded, so a failed upload leaves
	        the previously loaded document (if any) usable. The conversation
	        memory is reset for the new document and seeded with one entry
	        describing it.

	        Args:
	            pdf_path: Filesystem path of the PDF to load.

	        Raises:
	            ValueError: If no text chunks could be produced from the PDF.
	            Exception: Any error raised while loading, embedding or building
	                the chain is logged and re-raised.
	        """
	        try:
	            documents = PyPDFLoader(pdf_path).load()
	            texts = self.text_splitter.split_documents(documents)
	            if not texts:
	                # e.g. a scanned PDF without a text layer; fail with a clear
	                # message instead of an error from inside the vector store.
	                raise ValueError("No extractable text found in the PDF.")

	            db = FAISS.from_documents(texts, self.embeddings)
	            chain = ConversationalRetrievalChain.from_llm(
	                llm=self.llm,
	                retriever=db.as_retriever(search_kwargs={"k": 3}),
	                memory=self.memory,
	                combine_docs_chain_kwargs={"prompt": self.qa_prompt},
	            )

	            # Everything that can reasonably fail has succeeded: switch over.
	            self._extract_pdf_metadata(pdf_path)  # title, author, etc.
	            self.db = db
	            self.chain = chain

	            # Drop the conversation about any previous document so it cannot
	            # leak into questions about this one.
	            self.memory.clear()

	            # Extract document context and store it in memory
	            document_context = self._extract_document_type()
	            logger.info(f"Extracted document context: {document_context}")
	            self.memory.save_context({"input": "System"}, {"output": f"Document context: {document_context}"})
	        except Exception as e:
	            logger.error(f"PDF processing error: {e}")
	            raise

	    def chat(self, query):
	        """Answer ``query`` against the loaded document.

	        Returns:
	            The chain's ``'answer'`` value, or the fixed hint
	            "Please upload a PDF first." when no document is loaded.
	        """
	        if not self.chain:
	            return "Please upload a PDF first."

	        context_hints = self._extract_document_type()
	        refined_query = self.query_refiner.refine_query(query, context_hints)
	        logger.info("Refined query: %s | context: %s", refined_query, context_hints)
	        result = self.chain({"question": refined_query})
	        return result['answer']

	    def _extract_document_type(self):
	        """Extract detailed document characteristics

	        Returns:
	            A string of the form "Title: ..., Author: ..., Headings: ...",
	            "No document loaded" when there is no index, or
	            "Academic/technical document" if extraction fails.
	        """
	        if not self.db:
	            return "No document loaded"
	        try:
	            # NOTE(review): relies on the docstore's private ``_dict``;
	            # only the first stored chunk is needed, so do not copy them all.
	            first_chunk = next(iter(self.db.docstore._dict.values()))
	            headings = self._extract_headings(first_chunk.page_content[:1000])
	            # ``or`` also covers metadata values stored as None / "".
	            title = self.document_metadata.get('title') or 'Unknown Title'
	            author = self.document_metadata.get('author') or 'Unknown Author'
	            return f"Title: {title}, Author: {author}, Headings: {headings}"
	        except Exception as e:
	            logger.error(f"Error extracting document type: {e}")
	            return "Academic/technical document"

	    def _extract_pdf_metadata(self, pdf_path):
	        """Extract metadata like title, author, and creation date

	        Stores the result in ``self.document_metadata``; on any error the
	        metadata is reset to an empty dict (best effort, never raises).
	        """
	        try:
	            # ``metadata`` can be None for PDFs without an info dictionary.
	            metadata = PdfReader(pdf_path).metadata or {}
	            self.document_metadata = {
	                "title": metadata.get("/Title") or "Unknown Title",
	                "author": metadata.get("/Author") or "Unknown Author",
	                "creation_date": metadata.get("/CreationDate") or "Unknown Date",
	            }
	            logger.info(f"Extracted PDF metadata: {self.document_metadata}")
	        except Exception as e:
	            logger.error(f"Error extracting PDF metadata: {e}")
	            self.document_metadata = {}

	    def _extract_headings(self, text):
	        """Extract headings from the first document's content

	        A line counts as a heading when, after stripping whitespace, it is
	        title-cased (``str.istitle``).

	        Returns:
	            Up to five headings joined by ", ", or "No headings found" when
	            there are none or ``text`` cannot be processed.
	        """
	        try:
	            stripped = (line.strip() for line in text.splitlines())
	            headings = [line for line in stripped if line.istitle()]
	        except Exception as e:
	            logger.error(f"Error extracting headings: {e}")
	            return "No headings found"
	        if not headings:
	            return "No headings found"
	        return ', '.join(headings[:5])  # Return the first 5 headings

	    def clear_memory(self):
	        """Forget the conversation history (the loaded document is kept)."""
	        self.memory.clear()

	# Gradio Interface
	# One shared chatbot instance backs every callback below; the API key is
	# read from the OPENAI_API_KEY environment variable (None when unset).
	pdf_chatbot = AdvancedPdfChatbot(os.getenv("OPENAI_API_KEY"))

	def upload_pdf(pdf_file):
	    """Gradio callback: index the uploaded PDF and report the outcome.

	    Args:
	        pdf_file: The upload value; either an object exposing the file path
	            as ``.name`` or the path itself. ``None`` when nothing is selected.

	    Returns:
	        A status message for the "Upload Status" textbox.
	    """
	    if pdf_file is None:
	        return "Please upload a PDF file."

	    # Use the ``name`` attribute when present, otherwise the value itself.
	    file_path = getattr(pdf_file, 'name', pdf_file)
	    try:
	        pdf_chatbot.load_and_process_pdf(file_path)
	    except Exception as e:
	        logger.error(f"PDF processing error: {e}")
	        reason = str(e)
	        return f"Error processing PDF: {reason}"
	    return f"PDF processed successfully: {file_path}"

	def respond(message, history):
	    """Gradio callback: answer ``message`` and append the turn to the chat.

	    Args:
	        message: Text submitted from the query textbox.
	        history: Current chat history as a list of (user, bot) pairs;
	            ``None`` is treated as an empty history.

	    Returns:
	        A ``(textbox_value, history)`` pair. The textbox is always cleared.
	        When answering fails, the error is shown as the bot's reply in the
	        chat instead of being written into the input textbox.
	    """
	    if history is None:
	        history = []
	    if not message:
	        return "", history
	    try:
	        bot_message = pdf_chatbot.chat(message)
	    except Exception as e:
	        logger.error(f"Chat response error: {e}")
	        # Keep the failed turn visible in the conversation rather than
	        # overwriting the user's input box with the error text.
	        bot_message = f"Error: {str(e)}"
	    history.append((message, bot_message))
	    return "", history

	def clear_chatbot():
	    """Gradio callback: wipe the bot's memory and empty the chat display."""
	    pdf_chatbot.clear_memory()
	    empty_history = []
	    return empty_history

	# Gradio UI
	with gr.Blocks() as demo:
	    # --- Components, in display order -------------------------------
	    gr.Markdown("# Advanced PDF Chatbot")
	    with gr.Row():
	        pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
	        upload_button = gr.Button("Process PDF")

	    upload_status = gr.Textbox(label="Upload Status")
	    chatbot_interface = gr.Chatbot()
	    msg = gr.Textbox(placeholder="Enter your query...")
	    clear_button = gr.Button("Clear Conversation")

	    # --- Event wiring -------------------------------------------------
	    # "Process PDF" indexes the selected file and reports the result.
	    upload_button.click(
	        upload_pdf,
	        inputs=[pdf_upload],
	        outputs=[upload_status],
	    )
	    # Submitting the textbox sends the query and refreshes the chat.
	    msg.submit(
	        respond,
	        inputs=[msg, chatbot_interface],
	        outputs=[msg, chatbot_interface],
	    )
	    # "Clear Conversation" resets both bot memory and the chat display.
	    clear_button.click(clear_chatbot, outputs=[chatbot_interface])

	# Start the web app only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch()