Pavan178 committed on
Commit
7f36a98
·
verified ·
1 Parent(s): 0b367de

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -30
app.py CHANGED
@@ -11,6 +11,7 @@ from langchain.memory import ConversationBufferMemory
11
  from langchain.prompts import PromptTemplate
12
  import concurrent.futures
13
  import timeout_decorator
 
14
 
15
  # Configure logging
16
  logging.basicConfig(level=logging.INFO)
@@ -22,15 +23,12 @@ class QueryRefiner:
22
  self.refinement_prompt = PromptTemplate(
23
  input_variables=['query', 'context'],
24
  template="""Refine and enhance the following query for maximum clarity and precision:
25
-
26
  Original Query: {query}
27
  Document Context: {context}
28
-
29
  Enhanced Query Requirements:
30
  - Restructure for optimal comprehension
31
- - rewrite the original query for best comprehension for getting all the details in great attention to details
32
- - Use specific structure and the response be according to context such as paragraphs or bullet points, headlines and subtexts
33
-
34
  Refined Query:"""
35
  )
36
  self.refinement_chain = LLMChain(
@@ -38,7 +36,6 @@ Refined Query:"""
38
  prompt=self.refinement_prompt
39
  )
40
 
41
-
42
  def refine_query(self, original_query, context_hints=''):
43
  try:
44
  refined_query = self.refinement_chain.run({
@@ -61,39 +58,45 @@ class AdvancedPdfChatbot:
61
  self.query_refiner = QueryRefiner()
62
  self.db = None
63
  self.chain = None
 
64
 
65
  self.qa_prompt = PromptTemplate(
66
  template="""You are an expert academic assistant analyzing a document. Provide well structured response in Markdown
67
-
68
  Context: {context}
69
  Question: {question}
70
-
71
  Provide a comprehensive, precise answer based strictly on the document's content.
72
  Use this format:
73
  - Short summary of the response with a relevant title
74
- - Headlines and bullet points with descriptions with breakdowns of each topics and details
75
  - Conclusion
76
-
77
- NOTE: Give precise and short answers when asked about specific terms and summary of specific topic
78
-
79
  If the answer isn't directly available, explain why. """,
80
  input_variables=["context", "question"]
81
  )
82
 
83
-
84
  def load_and_process_pdf(self, pdf_path):
85
- loader = PyPDFLoader(pdf_path)
86
- documents = loader.load()
87
- texts = self.text_splitter.split_documents(documents)
88
- self.db = FAISS.from_documents(texts, self.embeddings)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- self.chain = ConversationalRetrievalChain.from_llm(
91
- llm=self.llm,
92
- retriever=self.db.as_retriever(search_kwargs={"k": 3}),
93
- memory=self.memory,
94
- combine_docs_chain_kwargs={"prompt": self.qa_prompt}
95
- )
96
-
97
 
98
  def chat(self, query):
99
  if not self.chain:
@@ -106,15 +109,48 @@ If the answer isn't directly available, explain why. """,
106
  return result['answer']
107
 
108
  def _extract_document_type(self):
109
- """Extract basic document characteristics"""
110
  if not self.db:
111
- return ""
112
  try:
113
- first_doc = list(self.db.docstore._dict.values())[0].page_content[:500]
114
- return f"Document appears to cover: {first_doc[:100]}..."
115
- except:
 
 
 
 
 
 
 
 
 
116
  return "Academic/technical document"
117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  def clear_memory(self):
119
  self.memory.clear()
120
 
@@ -166,4 +202,4 @@ with gr.Blocks() as demo:
166
  clear_button.click(clear_chatbot, outputs=[chatbot_interface])
167
 
168
  if __name__ == "__main__":
169
- demo.launch()
 
11
  from langchain.prompts import PromptTemplate
12
  import concurrent.futures
13
  import timeout_decorator
14
+ from PyPDF2 import PdfReader # New import for PDF metadata extraction
15
 
16
  # Configure logging
17
  logging.basicConfig(level=logging.INFO)
 
23
  self.refinement_prompt = PromptTemplate(
24
  input_variables=['query', 'context'],
25
  template="""Refine and enhance the following query for maximum clarity and precision:
 
26
  Original Query: {query}
27
  Document Context: {context}
 
28
  Enhanced Query Requirements:
29
  - Restructure for optimal comprehension
30
+ - Rewrite the original query for the best comprehension and attention to detail
31
+ - Use specific structure; response should include paragraphs, bullet points, headlines, and subtexts
 
32
  Refined Query:"""
33
  )
34
  self.refinement_chain = LLMChain(
 
36
  prompt=self.refinement_prompt
37
  )
38
 
 
39
  def refine_query(self, original_query, context_hints=''):
40
  try:
41
  refined_query = self.refinement_chain.run({
 
58
  self.query_refiner = QueryRefiner()
59
  self.db = None
60
  self.chain = None
61
+ self.document_metadata = {} # Store extracted document metadata
62
 
63
  self.qa_prompt = PromptTemplate(
64
  template="""You are an expert academic assistant analyzing a document. Provide well structured response in Markdown
 
65
  Context: {context}
66
  Question: {question}
 
67
  Provide a comprehensive, precise answer based strictly on the document's content.
68
  Use this format:
69
  - Short summary of the response with a relevant title
70
+ - Headlines and bullet points with descriptions with breakdowns of each topic and details
71
  - Conclusion
72
+ NOTE: Give precise and short answers when asked about specific terms and summaries of specific topics.
 
 
73
  If the answer isn't directly available, explain why. """,
74
  input_variables=["context", "question"]
75
  )
76
 
 
def load_and_process_pdf(self, pdf_path):
    """Load a PDF, index its chunks and build the conversational chain.

    Steps: extract PDF metadata, load the document with ``PyPDFLoader``,
    split it with ``self.text_splitter``, index the chunks in FAISS and
    create a ``ConversationalRetrievalChain`` over that index.

    ``self.db`` and ``self.chain`` are only replaced once both have been
    built successfully, so a failed load never leaves a new index paired
    with a chain that still points at the previous document.

    Args:
        pdf_path: Path of the PDF file to load.

    Raises:
        Exception: Whatever the loader, splitter, FAISS or chain
            construction raised; it is logged and re-raised unchanged.
    """
    try:
        self._extract_pdf_metadata(pdf_path)  # Extract metadata (title, author, etc.)
        documents = PyPDFLoader(pdf_path).load()
        texts = self.text_splitter.split_documents(documents)
        db = FAISS.from_documents(texts, self.embeddings)
        chain = ConversationalRetrievalChain.from_llm(
            llm=self.llm,
            retriever=db.as_retriever(search_kwargs={"k": 3}),
            memory=self.memory,
            combine_docs_chain_kwargs={"prompt": self.qa_prompt},
        )
    except Exception as e:
        logger.error(f"PDF processing error: {e}")
        # Bare raise keeps the original traceback intact.
        raise

    # Publish the new index and chain together.
    self.db = db
    self.chain = chain

    # Extract document context and store it in memory
    document_context = self._extract_document_type()
    logger.info(f"Extracted document context: {document_context}")
    try:
        # NOTE(review): assumes self.memory is a LangChain chat memory
        # (ConversationBufferMemory is imported by this file) whose
        # messages live on ``chat_memory`` -- confirm against __init__.
        # The memory object has no ``chat_history`` attribute; that is
        # only the memory *key*, so appending to it raised on every load.
        from langchain.schema import SystemMessage

        self.memory.chat_memory.add_message(
            SystemMessage(content=f"Document context: {document_context}")
        )
    except Exception as e:
        # Seeding the context is a nicety; the PDF itself is usable.
        logger.warning(f"Could not store document context in memory: {e}")
 
 
 
 
100
 
101
  def chat(self, query):
102
  if not self.chain:
 
109
  return result['answer']
110
 
def _extract_document_type(self):
    """Build a one-line description of the loaded document.

    Combines the title and author stored in ``self.document_metadata``
    with the headings that ``self._extract_headings`` finds in the first
    1000 characters of the first stored chunk.

    Returns:
        ``"No document loaded"`` when ``self.db`` is unset, the string
        ``"Title: ..., Author: ..., Headings: ..."`` on success, or
        ``"Academic/technical document"`` if anything goes wrong.
    """
    if not self.db:
        return "No document loaded"
    try:
        # Take only the first stored chunk instead of copying every
        # docstore value into a list just to index element 0.
        # NOTE(review): relies on the private ``docstore._dict`` mapping.
        first_chunk = next(iter(self.db.docstore._dict.values()), None)
        if first_chunk is None:
            raise ValueError("docstore is empty")
        first_doc = first_chunk.page_content[:1000]
        headings = self._extract_headings(first_doc)
        title = self.document_metadata.get('title', 'Unknown Title')
        author = self.document_metadata.get('author', 'Unknown Author')
        return f"Title: {title}, Author: {author}, Headings: {headings}"
    except Exception as e:
        logger.error(f"Error extracting document type: {e}")
        return "Academic/technical document"
129
 
def _extract_pdf_metadata(self, pdf_path):
    """Extract metadata like title, author, and creation date.

    Reads the PDF's document-information dictionary with ``PdfReader``
    and stores ``title``, ``author`` and ``creation_date`` in
    ``self.document_metadata``. Missing or empty entries fall back to an
    "Unknown ..." placeholder. On any error the metadata is reset to an
    empty dict and the error is logged; nothing is raised.

    Args:
        pdf_path: Path of the PDF file to inspect.
    """
    try:
        # ``metadata`` is None when the PDF has no information
        # dictionary; treat that as "no entries" rather than an error.
        info = PdfReader(pdf_path).metadata or {}
        # ``or`` also covers keys that exist but hold None / "".
        self.document_metadata = {
            "title": info.get("/Title") or "Unknown Title",
            "author": info.get("/Author") or "Unknown Author",
            "creation_date": info.get("/CreationDate") or "Unknown Date",
        }
        logger.info(f"Extracted PDF metadata: {self.document_metadata}")
    except Exception as e:
        logger.error(f"Error extracting PDF metadata: {e}")
        self.document_metadata = {}
143
+
def _extract_headings(self, text, max_headings=5):
    """Extract heading-like lines from a block of text.

    Simple heuristic: a line counts as a heading when, after stripping
    surrounding whitespace, it is title-case (``"Related Work"``) or
    all uppercase (``"INTRODUCTION"``).

    Args:
        text: Text to scan, one candidate heading per line.
        max_headings: Maximum number of headings to return.

    Returns:
        The first ``max_headings`` headings joined with ``", "``, or
        ``"No headings found"`` when ``text`` is not a non-empty string
        or contains no heading-like line.
    """
    if not isinstance(text, str) or not text:
        return "No headings found"

    stripped_lines = (line.strip() for line in text.splitlines())
    # istitle() alone rejects all-caps headings, so check both forms.
    headings = [
        line for line in stripped_lines
        if line.istitle() or line.isupper()
    ]
    if not headings:
        return "No headings found"
    return ', '.join(headings[:max_headings])
153
+
154
  def clear_memory(self):
155
  self.memory.clear()
156
 
 
202
  clear_button.click(clear_chatbot, outputs=[chatbot_interface])
203
 
204
  if __name__ == "__main__":
205
+ demo.launch()