Pavan178 committed on
Commit
75fd4bb
·
verified ·
1 Parent(s): d78dd14

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -106
app.py CHANGED
@@ -9,38 +9,56 @@ from langchain.chat_models import ChatOpenAI
9
  from langchain.chains import ConversationalRetrievalChain, LLMChain
10
  from langchain.memory import ConversationBufferMemory
11
  from langchain.prompts import PromptTemplate
 
12
 
13
- from PyPDF2 import PdfReader # New import for PDF metadata extraction
14
-
15
- # Configure logging
16
  logging.basicConfig(level=logging.INFO)
17
  logger = logging.getLogger(__name__)
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
class QueryRefiner:
    """Rewrites a user query into a clearer, more precise form before retrieval."""

    def __init__(self):
        # Low temperature keeps the rewrite close to the user's intent.
        self.refinement_llm = ChatOpenAI(temperature=0.2, model_name='gpt-4o')
        self.refinement_prompt = PromptTemplate(
            input_variables=['query', 'context'],
            template="""Refine and enhance the following query for maximum clarity and precision:
Original Query: {query}
Document Context: {context}
Enhanced Query Requirements:
- Restructure for optimal comprehension
- Rewrite the question to the best context and structure of output desired
Refined Query:""",
        )
        self.refinement_chain = LLMChain(
            llm=self.refinement_llm,
            prompt=self.refinement_prompt,
        )

    def refine_query(self, original_query, context_hints=''):
        """Return a refined version of *original_query*; fall back to the original on any failure."""
        payload = {
            'query': original_query,
            'context': context_hints or "General academic document",
        }
        try:
            return self.refinement_chain.run(payload).strip()
        except Exception as e:
            logger.error(f"Query refinement error: {e}")
            return original_query
@@ -48,124 +66,108 @@ Refined Query:"""
48
class AdvancedPdfChatbot:
    """Conversational QA over a single PDF document.

    Extracts PDF metadata, builds a FAISS index over the chunked text, and
    answers questions through a ConversationalRetrievalChain using a custom
    Markdown-oriented QA prompt.

    Fixes vs. previous revision: bare ``raise`` instead of ``raise e`` (keeps
    the traceback), stray debug ``print`` removed from ``chat``, guard for
    ``PdfReader.metadata`` being ``None``, and typo fixes in the QA prompt.
    """

    def __init__(self, openai_api_key):
        os.environ["OPENAI_API_KEY"] = openai_api_key

        self.embeddings = OpenAIEmbeddings()
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        self.llm = ChatOpenAI(temperature=0, model_name='gpt-4o')

        self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
        self.query_refiner = QueryRefiner()

        self.db = None               # FAISS store, set by load_and_process_pdf()
        self.chain = None            # retrieval chain, set by load_and_process_pdf()
        self.document_metadata = {}  # title/author/creation date from the PDF

        self.qa_prompt = PromptTemplate(
            template="""You are an expert academic assistant analyzing a document. Provide well structured response in Markdown
Context: {context}
Question: {question}
Provide a comprehensive, precise answer based strictly on the document's content.


Use these different formats for different contexts:

example format 1:
- Short summary of the response with a relevant title
- Headlines and bullet points with descriptions with breakdowns of each topic and details
- Conclusion

example format 2:

Precise paragraph with headlines and a paragraph

example format 3:
Numbered bullet points or ordered lists

Use more such formats to suit the user given context

NOTE: Give precise and short answers when asked about specific terms and summaries of specific topics.
If the answer isn't directly available, explain why. """,
            input_variables=["context", "question"]
        )

    def load_and_process_pdf(self, pdf_path):
        """Index *pdf_path* and prepare the retrieval chain; re-raises on failure."""
        try:
            self._extract_pdf_metadata(pdf_path)  # title, author, creation date

            loader = PyPDFLoader(pdf_path)
            documents = loader.load()
            texts = self.text_splitter.split_documents(documents)
            self.db = FAISS.from_documents(texts, self.embeddings)

            self.chain = ConversationalRetrievalChain.from_llm(
                llm=self.llm,
                retriever=self.db.as_retriever(search_kwargs={"k": 3}),
                memory=self.memory,
                combine_docs_chain_kwargs={"prompt": self.qa_prompt}
            )

            # Seed conversation memory with the document context so later
            # turns can refer to it.
            document_context = self._extract_document_type()
            logger.info(f"Extracted document context: {document_context}")
            self.memory.save_context({"input": "System"}, {"output": f"Document context: {document_context}"})
        except Exception as e:
            logger.error(f"PDF processing error: {e}")
            raise  # bare raise keeps the original traceback ("raise e" reset it)

    def chat(self, query):
        """Answer *query* against the loaded PDF; the query is refined first."""
        if not self.chain:
            return "Please upload a PDF first."

        context_hints = self._extract_document_type()
        refined_query = self.query_refiner.refine_query(query, context_hints)
        result = self.chain({"question": refined_query})
        return result['answer']

    def _extract_document_type(self):
        """Summarize document characteristics (title, author, headings) as one string."""
        if not self.db:
            return "No document loaded"
        try:
            # NOTE(review): reaches into FAISS's private docstore dict — fragile
            # across langchain versions; confirm no public accessor exists.
            first_doc = list(self.db.docstore._dict.values())[0].page_content[:1000]
            headings = self._extract_headings(first_doc)
            context_details = {
                "Title": self.document_metadata.get('title', 'Unknown Title'),
                "Author": self.document_metadata.get('author', 'Unknown Author'),
                "First Snippet": first_doc[:300],
                "Headings": headings
            }
            context_str = f"Title: {context_details['Title']}, Author: {context_details['Author']}, Headings: {context_details['Headings']}"
            return context_str
        except Exception as e:
            logger.error(f"Error extracting document type: {e}")
            return "Academic/technical document"

    def _extract_pdf_metadata(self, pdf_path):
        """Populate self.document_metadata with title/author/creation date (best effort)."""
        try:
            reader = PdfReader(pdf_path)
            meta = reader.metadata or {}  # metadata can be None for some PDFs
            self.document_metadata = {
                "title": meta.get("/Title", "Unknown Title"),
                "author": meta.get("/Author", "Unknown Author"),
                "creation_date": meta.get("/CreationDate", "Unknown Date")
            }
            logger.info(f"Extracted PDF metadata: {self.document_metadata}")
        except Exception as e:
            logger.error(f"Error extracting PDF metadata: {e}")
            self.document_metadata = {}

    def _extract_headings(self, text):
        """Return up to the first five Title-Case lines of *text*, comma-joined."""
        try:
            headings = [line for line in text.split("\n") if line.strip().istitle()]
            return ', '.join(headings[:5])  # first 5 headings only
        except Exception as e:
            logger.error(f"Error extracting headings: {e}")
            return "No headings found"

    def clear_memory(self):
        """Reset the conversation history."""
        self.memory.clear()
170
# Gradio Interface
# NOTE(review): instantiated at import time; os.environ.get may return None
# if OPENAI_API_KEY is unset — TODO confirm intended behavior.
pdf_chatbot = AdvancedPdfChatbot(os.environ.get("OPENAI_API_KEY"))
 
9
  from langchain.chains import ConversationalRetrievalChain, LLMChain
10
  from langchain.memory import ConversationBufferMemory
11
  from langchain.prompts import PromptTemplate
12
+ from PyPDF2 import PdfReader
13
 
 
 
 
14
  logging.basicConfig(level=logging.INFO)
15
  logger = logging.getLogger(__name__)
16
 
17
class ResponseStructureSelector:
    """Asks the LLM to pick one of five response formats for an answer.

    Returns the choice as an integer 1-5.

    Fixes vs. previous revision: the bare ``except:`` is narrowed to
    ``except Exception``, and the fragile ``int(raw)`` parse is replaced by
    extracting the first integer from the completion and clamping it to 1-5
    (LLMs frequently answer with surrounding prose).
    """

    def __init__(self, llm):
        self.llm = llm
        self.structure_prompt = PromptTemplate(
            input_variables=['context', 'query'],
            template="""Analyze the context and query to determine the most appropriate response structure:
Context: {context}
Query: {query}

Select the optimal response format:
1. Markdown with bullet points and headlines
2. Concise paragraph with key insights
3. Numbered list with detailed explanations
4. Technical breakdown with subheadings
5. Quick summary with critical points

Choose the number (1-5) of the most suitable format:"""
        )
        self.structure_chain = LLMChain(llm=self.llm, prompt=self.structure_prompt)

    def select_structure(self, context, query):
        """Return the selected format number (1-5); default to 1 (Markdown) on any failure."""
        import re  # local import keeps the module's import block untouched
        try:
            raw = self.structure_chain.run({'context': context, 'query': query})
            match = re.search(r'\d+', raw)
            choice = int(match.group()) if match else 1
            # Clamp out-of-range answers back to the Markdown default.
            return choice if 1 <= choice <= 5 else 1
        except Exception:
            # Deliberate best-effort: any chain failure falls back to Markdown.
            return 1
44
class QueryRefiner:
    """Rewrites user queries into a clearer, more focused form for retrieval."""

    def __init__(self, llm):
        # The LLM is injected so all components share one client instance.
        self.refinement_llm = llm
        self.refinement_prompt = PromptTemplate(
            input_variables=['query', 'context'],
            template="""Refine query for clarity and precision:
Original Query: {query}
Document Context: {context}
Refined, Focused Query:""",
        )
        self.refinement_chain = LLMChain(llm=self.refinement_llm, prompt=self.refinement_prompt)

    def refine_query(self, original_query, context_hints=''):
        """Return the refined query string; on any error, return the original unchanged."""
        inputs = {
            'query': original_query,
            'context': context_hints or "General document",
        }
        try:
            return self.refinement_chain.run(inputs).strip()
        except Exception as e:
            logger.error(f"Query refinement error: {e}")
            return original_query
 
66
class AdvancedPdfChatbot:
    """Chat over one PDF: refine the query, retrieve top-k chunks with FAISS, answer.

    Fixes vs. previous revision: writing ``None`` into ``os.environ`` raised
    TypeError when no key was supplied (the module instantiates this class with
    ``os.environ.get(...)``, which may be None); ``PdfReader.metadata`` can be
    ``None`` for some PDFs.
    """

    def __init__(self, openai_api_key):
        # Only override the environment when a key was actually supplied;
        # os.environ rejects None values.
        if openai_api_key:
            os.environ["OPENAI_API_KEY"] = openai_api_key
        self.llm = ChatOpenAI(temperature=0, model_name='gpt-4o', max_tokens=1000)

        self.embeddings = OpenAIEmbeddings()
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)

        self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
        self.query_refiner = QueryRefiner(self.llm)
        self.response_selector = ResponseStructureSelector(self.llm)

        self.db = None                # FAISS store, set by load_and_process_pdf()
        self.chain = None             # retrieval chain, set by load_and_process_pdf()
        self.document_metadata = {}   # title/author extracted from the PDF

    def _create_response_prompt(self, structure_choice):
        """Build a PromptTemplate for one of the five response formats.

        NOTE(review): currently never called by chat(); presumably kept for
        planned structure-aware answering — confirm before removing.
        """
        structure_templates = {
            1: """Markdown Response with Structured Insights:
## {title}
### Key Highlights
{content}
### Conclusion
{conclusion}""",
            2: """{title}: {content}. Key Takeaway: {conclusion}""",
            3: """Structured Breakdown:
1. {title}
   - Main Point: {content}
2. Implications
   - {conclusion}""",
            4: """Technical Analysis
## {title}
### Core Concept
{content}
### Technical Implications
{conclusion}""",
            5: """Concise Summary: {title}. Key Points: {content}. Conclusion: {conclusion}."""
        }
        return PromptTemplate(
            template=structure_templates.get(structure_choice, structure_templates[1]),
            input_variables=["title", "content", "conclusion"]
        )

    def load_and_process_pdf(self, pdf_path):
        """Index *pdf_path*; return True on success, False on failure."""
        try:
            # Extract PDF metadata; reader.metadata can be None for some files.
            reader = PdfReader(pdf_path)
            meta = reader.metadata or {}
            self.document_metadata = {
                "title": meta.get("/Title", "Untitled Document"),
                "author": meta.get("/Author", "Unknown")
            }

            # Load and process PDF
            loader = PyPDFLoader(pdf_path)
            documents = loader.load()
            texts = self.text_splitter.split_documents(documents)

            # Create vector store with fewer documents to improve performance
            self.db = FAISS.from_documents(texts[:30], self.embeddings)

            # Setup conversational chain
            self.chain = ConversationalRetrievalChain.from_llm(
                llm=self.llm,
                retriever=self.db.as_retriever(search_kwargs={"k": 3}),
                memory=self.memory
            )

            return True
        except Exception as e:
            logger.error(f"PDF processing error: {e}")
            return False

    def chat(self, query):
        """Answer *query* against the loaded PDF; returns the answer string."""
        if not self.chain:
            return "Upload a PDF first."

        # Refine query using the document title as context.
        context = f"Document: {self.document_metadata.get('title', 'Unknown')}"
        refined_query = self.query_refiner.refine_query(query, context)

        # NOTE(review): the selected structure is computed but never consumed
        # (see _create_response_prompt); it costs an extra LLM round-trip —
        # confirm intent before removing or wiring it into the chain.
        structure_choice = self.response_selector.select_structure(context, refined_query)

        # Perform retrieval and answer generation
        result = self.chain({"question": refined_query})
        return result['answer']
154
# Gradio Interface (remains mostly the same)
# NOTE(review): module-level instantiation at import time; OPENAI_API_KEY may
# be unset here, making the argument None — TODO confirm intended behavior.
pdf_chatbot = AdvancedPdfChatbot(os.environ.get("OPENAI_API_KEY"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
def upload_pdf(pdf_file):
    """Gradio handler: index the uploaded PDF and report success or failure."""
    if not pdf_file:
        return "Upload a PDF file."
    # Gradio may hand us a file-like object (with .name) or a plain path string.
    file_path = getattr(pdf_file, 'name', pdf_file)
    if pdf_chatbot.load_and_process_pdf(file_path):
        return "PDF processed successfully"
    return "Processing failed"
 
 
 
 
 
 
 
 
162
 
163
def respond(message, history):
    """Gradio handler: answer *message*, append the exchange to *history*.

    Returns ("", history) on success so the input box clears; on failure the
    error text is returned in the message slot instead.
    """
    try:
        bot_message = pdf_chatbot.chat(message)
    except Exception as e:
        return f"Error: {e}", history
    history.append((message, bot_message))
    return "", history
 
170
 
 
 
171
 
172
# Gradio Interface
# NOTE(review): instantiated at import time; os.environ.get may return None
# if OPENAI_API_KEY is unset — TODO confirm intended behavior.
pdf_chatbot = AdvancedPdfChatbot(os.environ.get("OPENAI_API_KEY"))