langchain-chat-with-pdf-openai-MU

Paused

App Files Files Community

Pavan178 committed on Dec 9, 2024

Commit

7f36a98

verified ·

1 Parent(s): 0b367de

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -30

app.py CHANGED Viewed

@@ -11,6 +11,7 @@ from langchain.memory import ConversationBufferMemory
 from langchain.prompts import PromptTemplate
 import concurrent.futures
 import timeout_decorator
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -22,15 +23,12 @@ class QueryRefiner:
         self.refinement_prompt = PromptTemplate(
             input_variables=['query', 'context'],
             template="""Refine and enhance the following query for maximum clarity and precision:
 Original Query: {query}
 Document Context: {context}
 Enhanced Query Requirements:
 - Restructure for optimal comprehension
-- rewrite the original query for best comprehension for getting all the details in great attention to details
-- Use specific structure and the response be according to context such as paragraphs or bullet points, headlines and subtexts
 Refined Query:"""
         )
         self.refinement_chain = LLMChain(
@@ -38,7 +36,6 @@ Refined Query:"""
             prompt=self.refinement_prompt
         )
     def refine_query(self, original_query, context_hints=''):
         try:
             refined_query = self.refinement_chain.run({
@@ -61,39 +58,45 @@ class AdvancedPdfChatbot:
         self.query_refiner = QueryRefiner()
         self.db = None
         self.chain = None
         self.qa_prompt = PromptTemplate(
             template="""You are an expert academic assistant analyzing a document. Provide well structured response in Markdown
 Context: {context}
 Question: {question}
 Provide a comprehensive, precise answer based strictly on the document's content.
 Use this format:
 - Short summary of the response with a relevant title
-- Headlines and bullet points with descriptions with breakdowns of each topics and details
 - Conclusion
-NOTE: Give precise and short answers when asked about specific terms and summary of specific topic
 If the answer isn't directly available, explain why. """,
             input_variables=["context", "question"]
         )
     def load_and_process_pdf(self, pdf_path):
-        loader = PyPDFLoader(pdf_path)
-        documents = loader.load()
-        texts = self.text_splitter.split_documents(documents)
-        self.db = FAISS.from_documents(texts, self.embeddings)
-        self.chain = ConversationalRetrievalChain.from_llm(
-            llm=self.llm,
-            retriever=self.db.as_retriever(search_kwargs={"k": 3}),
-            memory=self.memory,
-            combine_docs_chain_kwargs={"prompt": self.qa_prompt}
-        )
     def chat(self, query):
         if not self.chain:
@@ -106,15 +109,48 @@ If the answer isn't directly available, explain why. """,
         return result['answer']
     def _extract_document_type(self):
-        """Extract basic document characteristics"""
         if not self.db:
-            return ""
         try:
-            first_doc = list(self.db.docstore._dict.values())[0].page_content[:500]
-            return f"Document appears to cover: {first_doc[:100]}..."
-        except:
             return "Academic/technical document"
     def clear_memory(self):
         self.memory.clear()
@@ -166,4 +202,4 @@ with gr.Blocks() as demo:
     clear_button.click(clear_chatbot, outputs=[chatbot_interface])
 if __name__ == "__main__":
-    demo.launch()

 from langchain.prompts import PromptTemplate
 import concurrent.futures
 import timeout_decorator
+from PyPDF2 import PdfReader  # New import for PDF metadata extraction
 # Configure logging
 logging.basicConfig(level=logging.INFO)
         self.refinement_prompt = PromptTemplate(
             input_variables=['query', 'context'],
             template="""Refine and enhance the following query for maximum clarity and precision:
 Original Query: {query}
 Document Context: {context}
 Enhanced Query Requirements:
 - Restructure for optimal comprehension
+- Rewrite the original query for the best comprehension and attention to detail
+- Use specific structure; response should include paragraphs, bullet points, headlines, and subtexts
 Refined Query:"""
         )
         self.refinement_chain = LLMChain(
             prompt=self.refinement_prompt
         )
     def refine_query(self, original_query, context_hints=''):
         try:
             refined_query = self.refinement_chain.run({
         self.query_refiner = QueryRefiner()
         self.db = None
         self.chain = None
+        self.document_metadata = {}  # Store extracted document metadata
         self.qa_prompt = PromptTemplate(
             template="""You are an expert academic assistant analyzing a document. Provide well structured response in Markdown
 Context: {context}
 Question: {question}
 Provide a comprehensive, precise answer based strictly on the document's content.
 Use this format:
 - Short summary of the response with a relevant title
+- Headlines and bullet points with descriptions with breakdowns of each topic and details
 - Conclusion
+NOTE: Give precise and short answers when asked about specific terms and summaries of specific topics.
 If the answer isn't directly available, explain why. """,
             input_variables=["context", "question"]
         )
     def load_and_process_pdf(self, pdf_path):
         """Index a PDF and build the conversational retrieval chain for it.

         Extracts the PDF metadata, loads and splits the document, builds a
         FAISS index plus a ``ConversationalRetrievalChain`` over it, and then
         records a short description of the document in the conversation
         memory.

         Args:
             pdf_path: Path of the PDF file to load.

         Raises:
             Exception: Any error raised while loading or indexing is logged
                 and re-raised unchanged.
         """
         try:
             self._extract_pdf_metadata(pdf_path)  # title, author, creation date
             documents = PyPDFLoader(pdf_path).load()
             texts = self.text_splitter.split_documents(documents)
             self.db = FAISS.from_documents(texts, self.embeddings)
             self.chain = ConversationalRetrievalChain.from_llm(
                 llm=self.llm,
                 retriever=self.db.as_retriever(search_kwargs={"k": 3}),
                 memory=self.memory,
                 combine_docs_chain_kwargs={"prompt": self.qa_prompt}
             )
             document_context = self._extract_document_type()
             logger.info(f"Extracted document context: {document_context}")
             # LangChain chat memories keep their messages on `chat_memory`;
             # there is no `chat_history` attribute to append to, so the old
             # append raised AttributeError and aborted every load.
             # NOTE(review): assumes self.memory is a ConversationBufferMemory
             # (it is created outside this method) - confirm.
             chat_memory = getattr(self.memory, "chat_memory", None)
             if chat_memory is not None:
                 chat_memory.add_ai_message(f"Document context: {document_context}")
             else:
                 logger.warning("Memory has no chat_memory; document context not stored")
         except Exception as e:
             logger.error(f"PDF processing error: {e}")
             # Bare raise keeps the original traceback intact for the caller.
             raise
     def chat(self, query):
         if not self.chain:
         return result['answer']
     def _extract_document_type(self):
+        """Extract detailed document characteristics"""
         if not self.db:
+            return "No document loaded"
         try:
+            first_doc = list(self.db.docstore._dict.values())[0].page_content[:1000]
+            headings = self._extract_headings(first_doc)
+            context_details = {
+                "Title": self.document_metadata.get('title', 'Unknown Title'),
+                "Author": self.document_metadata.get('author', 'Unknown Author'),
+                "First Snippet": first_doc[:300],
+                "Headings": headings
+            }
+            context_str = f"Title: {context_details['Title']}, Author: {context_details['Author']}, Headings: {context_details['Headings']}"
+            return context_str
+        except Exception as e:
+            logger.error(f"Error extracting document type: {e}")
             return "Academic/technical document"
+    def _extract_pdf_metadata(self, pdf_path):
+        """Extract metadata like title, author, and creation date"""
+        try:
+            reader = PdfReader(pdf_path)
+            self.document_metadata = {
+                "title": reader.metadata.get("/Title", "Unknown Title"),
+                "author": reader.metadata.get("/Author", "Unknown Author"),
+                "creation_date": reader.metadata.get("/CreationDate", "Unknown Date")
+            }
+            logger.info(f"Extracted PDF metadata: {self.document_metadata}")
+        except Exception as e:
+            logger.error(f"Error extracting PDF metadata: {e}")
+            self.document_metadata = {}
+    def _extract_headings(self, text):
+        """Extract headings from the first document's content"""
+        try:
+            # Simple heuristic: Extract lines with uppercase or title-case words (like headings)
+            headings = [line for line in text.split("\n") if line.strip().istitle()]
+            return ', '.join(headings[:5])  # Return the first 5 headings
+        except Exception as e:
+            logger.error(f"Error extracting headings: {e}")
+            return "No headings found"
     def clear_memory(self):
         """Reset the conversation memory by calling ``self.memory.clear()``."""
         self.memory.clear()
     clear_button.click(clear_chatbot, outputs=[chatbot_interface])
 if __name__ == "__main__":
+    demo.launch()