Update app.py
Browse files
app.py
CHANGED
@@ -11,6 +11,7 @@ from langchain.memory import ConversationBufferMemory
|
|
11 |
from langchain.prompts import PromptTemplate
|
12 |
import concurrent.futures
|
13 |
import timeout_decorator
|
|
|
14 |
|
15 |
# Configure logging
|
16 |
logging.basicConfig(level=logging.INFO)
|
@@ -22,15 +23,12 @@ class QueryRefiner:
|
|
22 |
self.refinement_prompt = PromptTemplate(
|
23 |
input_variables=['query', 'context'],
|
24 |
template="""Refine and enhance the following query for maximum clarity and precision:
|
25 |
-
|
26 |
Original Query: {query}
|
27 |
Document Context: {context}
|
28 |
-
|
29 |
Enhanced Query Requirements:
|
30 |
- Restructure for optimal comprehension
|
31 |
-
-
|
32 |
-
- Use specific structure
|
33 |
-
|
34 |
Refined Query:"""
|
35 |
)
|
36 |
self.refinement_chain = LLMChain(
|
@@ -38,7 +36,6 @@ Refined Query:"""
|
|
38 |
prompt=self.refinement_prompt
|
39 |
)
|
40 |
|
41 |
-
|
42 |
def refine_query(self, original_query, context_hints=''):
|
43 |
try:
|
44 |
refined_query = self.refinement_chain.run({
|
@@ -61,39 +58,45 @@ class AdvancedPdfChatbot:
|
|
61 |
self.query_refiner = QueryRefiner()
|
62 |
self.db = None
|
63 |
self.chain = None
|
|
|
64 |
|
65 |
self.qa_prompt = PromptTemplate(
|
66 |
template="""You are an expert academic assistant analyzing a document. Provide well structured response in Markdown
|
67 |
-
|
68 |
Context: {context}
|
69 |
Question: {question}
|
70 |
-
|
71 |
Provide a comprehensive, precise answer based strictly on the document's content.
|
72 |
Use this format:
|
73 |
- Short summary of the response with a relevant title
|
74 |
-
- Headlines and bullet points with descriptions with breakdowns of each
|
75 |
- Conclusion
|
76 |
-
|
77 |
-
NOTE: Give precise and short answers when asked about specific terms and summary of specific topic
|
78 |
-
|
79 |
If the answer isn't directly available, explain why. """,
|
80 |
input_variables=["context", "question"]
|
81 |
)
|
82 |
|
83 |
-
|
84 |
def load_and_process_pdf(self, pdf_path):
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
memory=self.memory,
|
94 |
-
combine_docs_chain_kwargs={"prompt": self.qa_prompt}
|
95 |
-
)
|
96 |
-
|
97 |
|
98 |
def chat(self, query):
|
99 |
if not self.chain:
|
@@ -106,15 +109,48 @@ If the answer isn't directly available, explain why. """,
|
|
106 |
return result['answer']
|
107 |
|
108 |
def _extract_document_type(self):
|
109 |
-
"""Extract
|
110 |
if not self.db:
|
111 |
-
return ""
|
112 |
try:
|
113 |
-
first_doc = list(self.db.docstore._dict.values())[0].page_content[:
|
114 |
-
|
115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
return "Academic/technical document"
|
117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
def clear_memory(self):
|
119 |
self.memory.clear()
|
120 |
|
@@ -166,4 +202,4 @@ with gr.Blocks() as demo:
|
|
166 |
clear_button.click(clear_chatbot, outputs=[chatbot_interface])
|
167 |
|
168 |
if __name__ == "__main__":
|
169 |
-
demo.launch()
|
|
|
11 |
from langchain.prompts import PromptTemplate
|
12 |
import concurrent.futures
|
13 |
import timeout_decorator
|
14 |
+
from PyPDF2 import PdfReader # New import for PDF metadata extraction
|
15 |
|
16 |
# Configure logging
|
17 |
logging.basicConfig(level=logging.INFO)
|
|
|
23 |
self.refinement_prompt = PromptTemplate(
|
24 |
input_variables=['query', 'context'],
|
25 |
template="""Refine and enhance the following query for maximum clarity and precision:
|
|
|
26 |
Original Query: {query}
|
27 |
Document Context: {context}
|
|
|
28 |
Enhanced Query Requirements:
|
29 |
- Restructure for optimal comprehension
|
30 |
+
- Rewrite the original query for the best comprehension and attention to detail
|
31 |
+
- Use specific structure; response should include paragraphs, bullet points, headlines, and subtexts
|
|
|
32 |
Refined Query:"""
|
33 |
)
|
34 |
self.refinement_chain = LLMChain(
|
|
|
36 |
prompt=self.refinement_prompt
|
37 |
)
|
38 |
|
|
|
39 |
def refine_query(self, original_query, context_hints=''):
|
40 |
try:
|
41 |
refined_query = self.refinement_chain.run({
|
|
|
58 |
self.query_refiner = QueryRefiner()
|
59 |
self.db = None
|
60 |
self.chain = None
|
61 |
+
self.document_metadata = {} # Store extracted document metadata
|
62 |
|
63 |
self.qa_prompt = PromptTemplate(
|
64 |
template="""You are an expert academic assistant analyzing a document. Provide well structured response in Markdown
|
|
|
65 |
Context: {context}
|
66 |
Question: {question}
|
|
|
67 |
Provide a comprehensive, precise answer based strictly on the document's content.
|
68 |
Use this format:
|
69 |
- Short summary of the response with a relevant title
|
70 |
+
- Headlines and bullet points with descriptions with breakdowns of each topic and details
|
71 |
- Conclusion
|
72 |
+
NOTE: Give precise and short answers when asked about specific terms and summaries of specific topics.
|
|
|
|
|
73 |
If the answer isn't directly available, explain why. """,
|
74 |
input_variables=["context", "question"]
|
75 |
)
|
76 |
|
|
|
77 |
def load_and_process_pdf(self, pdf_path):
    """Index a PDF and build the conversational retrieval chain over it.

    Steps, in order: extract the PDF metadata, load the pages, split them
    with ``self.text_splitter``, embed the chunks into a FAISS store
    (``self.db``), create ``self.chain`` on top of that store, and finally
    record a short description of the document in the conversation memory.

    Args:
        pdf_path: Path of the PDF file to load.

    Raises:
        Exception: Any error raised while loading, splitting, embedding or
            building the chain is logged and re-raised unchanged.
    """
    try:
        self._extract_pdf_metadata(pdf_path)  # title, author, creation date
        loader = PyPDFLoader(pdf_path)
        documents = loader.load()
        texts = self.text_splitter.split_documents(documents)
        self.db = FAISS.from_documents(texts, self.embeddings)

        self.chain = ConversationalRetrievalChain.from_llm(
            llm=self.llm,
            retriever=self.db.as_retriever(search_kwargs={"k": 3}),
            memory=self.memory,
            combine_docs_chain_kwargs={"prompt": self.qa_prompt},
        )

        # Extract document context and store it in memory.
        document_context = self._extract_document_type()
        logger.info(f"Extracted document context: {document_context}")
        # The memory object has no ``chat_history`` attribute to append to
        # ("chat_history" is only the key it exposes to the chain); stored
        # messages live on ``chat_memory``.
        # NOTE(review): assumes self.memory is a LangChain chat memory such
        # as ConversationBufferMemory -- confirm against __init__.
        self.memory.chat_memory.add_ai_message(
            f"Document context: {document_context}"
        )
    except Exception as e:
        logger.exception(f"PDF processing error: {e}")
        # Bare raise keeps the original traceback intact.
        raise
|
|
|
|
|
|
|
|
|
100 |
|
101 |
def chat(self, query):
|
102 |
if not self.chain:
|
|
|
109 |
return result['answer']
|
110 |
|
111 |
def _extract_document_type(self):
    """Build a one-line description of the loaded document.

    The description combines the title and author stored in
    ``self.document_metadata`` with headings detected in the first 1000
    characters of the first stored chunk.

    Returns:
        ``"No document loaded"`` when no vector store exists yet;
        otherwise ``"Title: ..., Author: ..., Headings: ..."``. If the
        store is empty or anything goes wrong, the generic fallback
        ``"Academic/technical document"`` is returned.
    """
    if not self.db:
        return "No document loaded"
    try:
        # Peek at the first stored chunk without copying the whole docstore.
        first_entry = next(iter(self.db.docstore._dict.values()), None)
        if first_entry is None:
            return "Academic/technical document"
        first_doc = first_entry.page_content[:1000]
        headings = self._extract_headings(first_doc)
        # ``or`` also covers metadata values that are present but None/empty.
        title = self.document_metadata.get('title') or 'Unknown Title'
        author = self.document_metadata.get('author') or 'Unknown Author'
        return f"Title: {title}, Author: {author}, Headings: {headings}"
    except Exception as e:
        logger.error(f"Error extracting document type: {e}")
        return "Academic/technical document"
|
129 |
|
130 |
+
def _extract_pdf_metadata(self, pdf_path):
    """Read title, author and creation date from the PDF's document info.

    The result is stored in ``self.document_metadata`` under the keys
    ``title``, ``author`` and ``creation_date``; missing or empty fields
    fall back to ``"Unknown ..."`` placeholders. Extraction is
    best-effort: on failure the error is logged, the metadata is reset to
    ``{}`` and no exception is raised.

    Args:
        pdf_path: Path of the PDF file to inspect.
    """
    try:
        reader = PdfReader(pdf_path)
        # ``metadata`` can be None when the PDF carries no document-info
        # dictionary; treat that as "no fields" instead of an error.
        info = reader.metadata or {}
        self.document_metadata = {
            "title": info.get("/Title") or "Unknown Title",
            "author": info.get("/Author") or "Unknown Author",
            "creation_date": info.get("/CreationDate") or "Unknown Date",
        }
        logger.info(f"Extracted PDF metadata: {self.document_metadata}")
    except Exception as e:
        logger.error(f"Error extracting PDF metadata: {e}")
        self.document_metadata = {}
|
143 |
+
|
144 |
+
def _extract_headings(self, text):
    """Return up to five heading-like lines from *text*, comma-separated.

    Heuristic: a line counts as a heading when, after stripping
    surrounding whitespace, it is title-case ("Related Work") or all
    uppercase ("ABSTRACT").

    Args:
        text: Raw text to scan, one candidate heading per line.

    Returns:
        The first five matching lines joined with ``", "``, or
        ``"No headings found"`` when nothing matches or *text* is not a
        string.
    """
    try:
        stripped_lines = (line.strip() for line in text.splitlines())
        headings = [
            line for line in stripped_lines
            if line.istitle() or line.isupper()
        ]
    except (AttributeError, TypeError) as e:
        logger.error(f"Error extracting headings: {e}")
        return "No headings found"
    if not headings:
        return "No headings found"
    return ', '.join(headings[:5])  # Return the first 5 headings
|
153 |
+
|
154 |
def clear_memory(self):
    """Reset the conversation memory by calling ``clear()`` on it."""
    conversation_memory = self.memory
    conversation_memory.clear()
|
156 |
|
|
|
202 |
clear_button.click(clear_chatbot, outputs=[chatbot_interface])
|
203 |
|
204 |
if __name__ == "__main__":
|
205 |
+
demo.launch()
|