langchain-chat-with-pdf-openai-MU

Paused

App Files Community

Pavan178 committed on Dec 9, 2024

Commit

75fd4bb

verified ·

1 Parent(s): d78dd14

Update app.py

Browse files

Files changed (1) hide show

app.py +108 -106

app.py CHANGED Viewed

@@ -9,38 +9,56 @@ from langchain.chat_models import ChatOpenAI
 from langchain.chains import ConversationalRetrievalChain, LLMChain
 from langchain.memory import ConversationBufferMemory
 from langchain.prompts import PromptTemplate
-from PyPDF2 import PdfReader  # New import for PDF metadata extraction
-# Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 class QueryRefiner:
-    def __init__(self):
-        self.refinement_llm = ChatOpenAI(temperature=0.2, model_name='gpt-4o')
         self.refinement_prompt = PromptTemplate(
             input_variables=['query', 'context'],
-            template="""Refine and enhance the following query for maximum clarity and precision:
 Original Query: {query}
 Document Context: {context}
-Enhanced Query Requirements:
-- Restructure for optimal comprehension
-- Rewrite the question to the best context and structure of output desired
-Refined Query:"""
-        )
-        self.refinement_chain = LLMChain(
-            llm=self.refinement_llm,
-            prompt=self.refinement_prompt
         )
     def refine_query(self, original_query, context_hints=''):
         try:
-            refined_query = self.refinement_chain.run({
                 'query': original_query,
-                'context': context_hints or "General academic document"
-            })
-            return refined_query.strip()
         except Exception as e:
             logger.error(f"Query refinement error: {e}")
             return original_query
@@ -48,124 +66,108 @@ Refined Query:"""
 class AdvancedPdfChatbot:
     def __init__(self, openai_api_key):
         os.environ["OPENAI_API_KEY"] = openai_api_key
         self.embeddings = OpenAIEmbeddings()
-        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-        self.llm = ChatOpenAI(temperature=0, model_name='gpt-4o')
         self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
-        self.query_refiner = QueryRefiner()
         self.db = None
         self.chain = None
-        self.document_metadata = {}  # Store extracted document metadata
-        self.qa_prompt = PromptTemplate(
-            template="""You are an expert academic assistant analyzing a document. Provide well structured response in Markdown
-Context: {context}
-Question: {question}
-Provide a comprehensive, precise answer based strictly on the document's content.
-Use this different formats for different contexts:
-example format 1:
-- Short summary of the response with a relevant title
-- Headlines and bullet points with descriptions with breakdowns of each topic and details
-- Conclusion
-example format 2:
-Precise pragraph with headlines and a paragraph
-example format 3:
-Numbered bullet points or ordered lists
-Use more such formats to suit the user given context
-NOTE: Give precise and short answers when asked about specific terms and summaries of specific topics.
-If the answer isn't directly available, explain why. """,
-            input_variables=["context", "question"]
         )
     def load_and_process_pdf(self, pdf_path):
         try:
-            self._extract_pdf_metadata(pdf_path)  # Extract metadata (title, author, etc.)
             loader = PyPDFLoader(pdf_path)
             documents = loader.load()
             texts = self.text_splitter.split_documents(documents)
-            self.db = FAISS.from_documents(texts, self.embeddings)
             self.chain = ConversationalRetrievalChain.from_llm(
                 llm=self.llm,
                 retriever=self.db.as_retriever(search_kwargs={"k": 3}),
-                memory=self.memory,
-                combine_docs_chain_kwargs={"prompt": self.qa_prompt}
             )
-            # Extract document context and store it in memory
-            document_context = self._extract_document_type()
-            logger.info(f"Extracted document context: {document_context}")
-            # Save document context in memory properly
-            self.memory.save_context({"input": "System"}, {"output": f"Document context: {document_context}"})
         except Exception as e:
             logger.error(f"PDF processing error: {e}")
-            raise e
     def chat(self, query):
         if not self.chain:
-            return "Please upload a PDF first."
-        context_hints = self._extract_document_type()
-        refined_query = self.query_refiner.refine_query(query, context_hints)
-        print(refined_query,context_hints)
         result = self.chain({"question": refined_query})
         return result['answer']
-    def _extract_document_type(self):
-        """Extract detailed document characteristics"""
-        if not self.db:
-            return "No document loaded"
-        try:
-            first_doc = list(self.db.docstore._dict.values())[0].page_content[:1000]
-            headings = self._extract_headings(first_doc)
-            context_details = {
-                "Title": self.document_metadata.get('title', 'Unknown Title'),
-                "Author": self.document_metadata.get('author', 'Unknown Author'),
-                "First Snippet": first_doc[:300],
-                "Headings": headings
-            }
-            context_str = f"Title: {context_details['Title']}, Author: {context_details['Author']}, Headings: {context_details['Headings']}"
-            return context_str
-        except Exception as e:
-            logger.error(f"Error extracting document type: {e}")
-            return "Academic/technical document"
-    def _extract_pdf_metadata(self, pdf_path):
-        """Extract metadata like title, author, and creation date"""
-        try:
-            reader = PdfReader(pdf_path)
-            self.document_metadata = {
-                "title": reader.metadata.get("/Title", "Unknown Title"),
-                "author": reader.metadata.get("/Author", "Unknown Author"),
-                "creation_date": reader.metadata.get("/CreationDate", "Unknown Date")
-            }
-            logger.info(f"Extracted PDF metadata: {self.document_metadata}")
-        except Exception as e:
-            logger.error(f"Error extracting PDF metadata: {e}")
-            self.document_metadata = {}
-    def _extract_headings(self, text):
-        """Extract headings from the first document's content"""
-        try:
-            headings = [line for line in text.split("\n") if line.strip().istitle()]
-            return ', '.join(headings[:5])  # Return the first 5 headings
-        except Exception as e:
-            logger.error(f"Error extracting headings: {e}")
-            return "No headings found"
-    def clear_memory(self):
-        self.memory.clear()
 # Gradio Interface
 pdf_chatbot = AdvancedPdfChatbot(os.environ.get("OPENAI_API_KEY"))

 from langchain.chains import ConversationalRetrievalChain, LLMChain
 from langchain.memory import ConversationBufferMemory
 from langchain.prompts import PromptTemplate
+from PyPDF2 import PdfReader
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+class ResponseStructureSelector:
+    def __init__(self, llm):
+        self.llm = llm
+        self.structure_prompt = PromptTemplate(
+            input_variables=['context', 'query'],
+            template="""Analyze the context and query to determine the most appropriate response structure:
+Context: {context}
+Query: {query}
+Select the optimal response format:
+1. Markdown with bullet points and headlines
+2. Concise paragraph with key insights
+3. Numbered list with detailed explanations
+4. Technical breakdown with subheadings
+5. Quick summary with critical points
+Choose the number (1-5) of the most suitable format:"""
+        )
+        self.structure_chain = LLMChain(llm=self.llm, prompt=self.structure_prompt)
+    def select_structure(self, context, query):
+        try:
+            structure_choice = self.structure_chain.run({'context': context, 'query': query})
+            return int(structure_choice.strip())
+        except:
+            return 1  # Default to Markdown structure
 class QueryRefiner:
+    def __init__(self, llm):
+        self.refinement_llm = llm
         self.refinement_prompt = PromptTemplate(
             input_variables=['query', 'context'],
+            template="""Refine query for clarity and precision:
 Original Query: {query}
 Document Context: {context}
+Refined, Focused Query:"""
         )
+        self.refinement_chain = LLMChain(llm=self.refinement_llm, prompt=self.refinement_prompt)
     def refine_query(self, original_query, context_hints=''):
         try:
+            return self.refinement_chain.run({
                 'query': original_query,
+                'context': context_hints or "General document"
+            }).strip()
         except Exception as e:
             logger.error(f"Query refinement error: {e}")
             return original_query
 class AdvancedPdfChatbot:
     def __init__(self, openai_api_key):
         os.environ["OPENAI_API_KEY"] = openai_api_key
+        self.llm = ChatOpenAI(temperature=0, model_name='gpt-4o', max_tokens=1000)
         self.embeddings = OpenAIEmbeddings()
+        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
         self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+        self.query_refiner = QueryRefiner(self.llm)
+        self.response_selector = ResponseStructureSelector(self.llm)
         self.db = None
         self.chain = None
+        self.document_metadata = {}
+    def _create_response_prompt(self, structure_choice):
+        structure_templates = {
+            1: """Markdown Response with Structured Insights:
+## {title}
+### Key Highlights
+{content}
+### Conclusion
+{conclusion}""",
+            2: """{title}: {content}. Key Takeaway: {conclusion}""",
+            3: """Structured Breakdown:
+1. {title}
+   - Main Point: {content}
+2. Implications
+   - {conclusion}""",
+            4: """Technical Analysis
+## {title}
+### Core Concept
+{content}
+### Technical Implications
+{conclusion}""",
+            5: """Concise Summary: {title}. Key Points: {content}. Conclusion: {conclusion}."""
+        }
+        return PromptTemplate(
+            template=structure_templates.get(structure_choice, structure_templates[1]),
+            input_variables=["title", "content", "conclusion"]
         )
     def load_and_process_pdf(self, pdf_path):
         try:
+            # Extract PDF metadata
+            reader = PdfReader(pdf_path)
+            self.document_metadata = {
+                "title": reader.metadata.get("/Title", "Untitled Document"),
+                "author": reader.metadata.get("/Author", "Unknown")
+            }
+            # Load and process PDF
             loader = PyPDFLoader(pdf_path)
             documents = loader.load()
             texts = self.text_splitter.split_documents(documents)
+            # Create vector store with fewer documents to improve performance
+            self.db = FAISS.from_documents(texts[:30], self.embeddings)
+            # Setup conversational chain
             self.chain = ConversationalRetrievalChain.from_llm(
                 llm=self.llm,
                 retriever=self.db.as_retriever(search_kwargs={"k": 3}),
+                memory=self.memory
             )
+            return True
         except Exception as e:
             logger.error(f"PDF processing error: {e}")
+            return False
     def chat(self, query):
         if not self.chain:
+            return "Upload a PDF first."
+        # Refine query
+        context = f"Document: {self.document_metadata.get('title', 'Unknown')}"
+        refined_query = self.query_refiner.refine_query(query, context)
+        # Select response structure
+        structure_choice = self.response_selector.select_structure(context, refined_query)
+        # Perform retrieval and answer generation
         result = self.chain({"question": refined_query})
         return result['answer']
+# Gradio Interface (remains mostly the same)
+pdf_chatbot = AdvancedPdfChatbot(os.environ.get("OPENAI_API_KEY"))
+def upload_pdf(pdf_file):
+    if not pdf_file:
+        return "Upload a PDF file."
+    file_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file
+    return "PDF processed successfully" if pdf_chatbot.load_and_process_pdf(file_path) else "Processing failed"
+def respond(message, history):
+    try:
+        bot_message = pdf_chatbot.chat(message)
+        history.append((message, bot_message))
+        return "", history
+    except Exception as e:
+        return f"Error: {e}", history
 # Gradio Interface
 pdf_chatbot = AdvancedPdfChatbot(os.environ.get("OPENAI_API_KEY"))