new code
- src/app.py +0 -5
- src/pdfchatbot.py +21 -123
src/app.py
CHANGED
@@ -1,19 +1,14 @@
 from interface import create_demo
 from pdfchatbot import PDFChatBot
 
-# Create Gradio interface
 demo, chat_history, show_img, txt, submit_button, uploaded_pdf, slider1 = create_demo()
 
-# Create PDFChatBot instance
 pdf_chatbot = PDFChatBot()
 pdf_chatbot.create_organic_pipeline()
 
-# Set up event handlers
 with demo:
-    # Event handler for uploading a PDF
     uploaded_pdf.upload(pdf_chatbot.render_file, inputs=[uploaded_pdf,slider1], outputs=[show_img])
 
-    # Event handler for submitting text and generating response
     submit_button.click(pdf_chatbot.add_text, inputs=[chat_history, txt], outputs=[chat_history], queue=False).\
         success(pdf_chatbot.generate_response, inputs=[chat_history, txt, uploaded_pdf], outputs=[chat_history,txt]).\
         success(pdf_chatbot.render_file, inputs=[uploaded_pdf], outputs=[show_img])
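Aside: the submit_button.click(...).success(...).success(...) chain above uses Gradio event chaining, where each .success() callback fires only if the previous step completed without raising an error (e.g. the gr.Error raised by add_text on empty input). A minimal self-contained sketch of the same pattern, assuming Gradio 4.x; the echo and clear_box names are illustrative and not part of this repo:

import gradio as gr

def echo(history, text):
    # Append the user's message and a canned reply to the chat history.
    return history + [(text, "You said: " + text)]

def clear_box():
    # Runs only if echo() succeeded; clears the input textbox.
    return ""

with gr.Blocks() as demo:
    chat = gr.Chatbot()
    box = gr.Textbox()
    btn = gr.Button("Submit")
    # .click() returns an event reference; .success() chains a follow-up
    # callback that fires only when the previous step raised no error.
    btn.click(echo, inputs=[chat, box], outputs=[chat], queue=False).\
        success(clear_box, inputs=None, outputs=[box])

demo.launch()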
src/pdfchatbot.py
CHANGED
@@ -40,62 +40,11 @@ class PDFChatBot:
         self.format_seperator="""\n\n--\n\n"""
         #self.chunk_size_slider = chunk_size_slider
 
-    def load_config(self, file_path):
-        """
-        Load configuration from a YAML file.
-
-        Parameters:
-            file_path (str): Path to the YAML configuration file.
-
-        Returns:
-            dict: Configuration as a dictionary.
-        """
-        with open(file_path, 'r') as stream:
-            try:
-                config = yaml.safe_load(stream)
-                return config
-            except yaml.YAMLError as exc:
-                print(f"Error loading configuration: {exc}")
-                return None
-
-    def add_text(self, history, text):
-        """
-        Add user-entered text to the chat history.
-
-        Parameters:
-            history (list): List of chat history tuples.
-            text (str): User-entered text.
-
-        Returns:
-            list: Updated chat history.
-        """
-        if not text:
-            raise gr.Error('Enter text')
-        history.append((text, ''))
-        #self.chunk_size_slider.interactive = False
-        return history
-
-    def create_prompt_template(self):
-        """
-        Create a prompt template for the chatbot.
-        """
-        template = (
-            f"The assistant should provide detailed explanations."
-            "Combine the chat history and follow up question into "
-            "Follow up question: What is this"
-        )
-        self.prompt = PromptTemplate.from_template(template)
-
     def load_embeddings(self):
-        """
-
-        """
-        self.embeddings = HuggingFaceEmbeddings(model_name=self.config.get("modelEmbeddings"))
+        self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+        print("Embedding model loaded")
 
     def load_vectordb(self):
-        """
-        Load the vector database from the documents and embeddings.
-        """
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=256,
             chunk_overlap=100,
@@ -104,49 +53,23 @@ class PDFChatBot:
         )
         docs = text_splitter.split_documents(self.documents)
         self.vectordb = Chroma.from_documents(docs, self.embeddings)
-
-    def load_tokenizer(self):
-        """
-        Load the tokenizer from Hugging Face and set in the config file.
-        """
-        self.tokenizer = AutoTokenizer.from_pretrained(self.config.get("autoTokenizer"))
-
-    def load_model(self):
-        """
-        Load the causal language model from Hugging Face and set in the config file.
-        """
-        self.model = AutoModelForCausalLM.from_pretrained(
-            self.config.get("autoModelForCausalLM"),
-            device_map='auto',
-            torch_dtype=torch.float16,
-            token=True,
-        )
-
-    def create_pipeline(self):
-        """
-        Create a pipeline for text generation using the loaded model and tokenizer.
-        """
-        pipe = pipeline(
-            model=self.model,
-            task='text-generation',
-            tokenizer=self.tokenizer,
-            max_new_tokens=1024
-        )
-        self.pipeline = HuggingFacePipeline(pipeline=pipe)
+        print("Vector store created")
 
     def create_organic_pipeline(self):
         self.pipeline = pipeline(
             "text-generation",
             model="meta-llama/Meta-Llama-3-8B-Instruct",
             tokenizer="meta-llama/Meta-Llama-3-8B-Instruct",
-            model_kwargs={"torch_dtype": torch.
+            model_kwargs={"torch_dtype": torch.float16},
             device="cuda",
         )
+        print("Model pipeline loaded")
 
     def get_organic_context(self, query):
         documents = self.vectordb.similarity_search_with_relevance_scores(query, k=3)
         context = self.format_seperator.join([doc.page_content for doc, score in documents])
         self.current_context = context
+        print("Context Ready")
         print(self.current_context)
 
     def create_organic_response(self, history, query):
@@ -175,17 +98,6 @@ class PDFChatBot:
         )
         return outputs[0]["generated_text"][len(prompt):]
 
-    def create_chain(self):
-        """
-        Create a Conversational Retrieval Chain
-        """
-        self.chain = ConversationalRetrievalChain.from_llm(
-            self.pipeline,
-            chain_type="stuff",
-            retriever=self.vectordb.as_retriever(search_kwargs={"k": 1}),
-            condense_question_prompt=self.prompt,
-            return_source_documents=True
-        )
 
     def process_file(self, file):
         """
@@ -194,24 +106,13 @@ class PDFChatBot:
         Parameters:
             file (FileStorage): The uploaded PDF file.
         """
-        self.create_prompt_template()
         self.documents = PyPDFLoader(file.name).load()
         self.load_embeddings()
         self.load_vectordb()
         #self.create_chain()
     @spaces.GPU
     def generate_response(self, history, query, file):
-        """
-        Generate a response based on user query and chat history.
-
-        Parameters:
-            history (list): List of chat history tuples.
-            query (str): User's query.
-            file (FileStorage): The uploaded PDF file.
-
-        Returns:
-            tuple: Updated chat history and a space.
-        """
         if not query:
             raise gr.Error(message='Submit a question')
         if not file:
@@ -220,31 +121,28 @@ class PDFChatBot:
         self.process_file(file)
         self.processed = True
 
-        """result = self.chain({"question": query, 'chat_history': self.chat_history}, return_only_outputs=True)
-        self.chat_history.append((query, result["answer"]))
-        for char in result['answer']:
-            history[-1][-1] += char
-        return history, " """""
-
         result = self.create_organic_response(history="",query=query)
         return result,""
 
     def render_file(self, file,chunk_size):
-        """
-        Renders a specific page of a PDF file as an image.
-
-        Parameters:
-            file (FileStorage): The PDF file.
-
-        Returns:
-            PIL.Image.Image: The rendered page as an image.
-        """
-        #if self.chunk_size_slider.interactive == False:
-        #    self.chunk_size_slider.interactive = True
         print(chunk_size)
         doc = fitz.open(file.name)
        page = doc[self.page]
         self.chunk_size = chunk_size
         pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72))
         image = Image.frombytes('RGB', [pix.width, pix.height], pix.samples)
-        return image
+        return image
+
+    def add_text(self, history, text):
+        """
+        Add user-entered text to the chat history.
+        Parameters:
+            history (list): List of chat history tuples.
+            text (str): User-entered text.
+        Returns:
+            list: Updated chat history.
+        """
+        if not text:
+            raise gr.Error('Enter text')
+        history.append((text, ''))
+        return history
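For reference, after this commit the end-to-end flow is process_file (PyPDFLoader, then load_embeddings, then load_vectordb) followed by create_organic_response driving the Llama-3 pipeline directly, with no ConversationalRetrievalChain in between. A rough local-usage sketch under stated assumptions: a CUDA GPU, access to the gated meta-llama checkpoint, and a hypothetical sample.pdf; SimpleNamespace stands in for the .name-bearing file object Gradio passes to callbacks, and @spaces.GPU should be a no-op outside a ZeroGPU Space:

from types import SimpleNamespace
from pdfchatbot import PDFChatBot

bot = PDFChatBot()
bot.create_organic_pipeline()  # loads Meta-Llama-3-8B-Instruct once at startup, as app.py does

# Gradio upload callbacks receive an object exposing a .name filesystem path;
# SimpleNamespace mimics that for a quick test. "sample.pdf" is hypothetical.
pdf = SimpleNamespace(name="sample.pdf")

# First call indexes the PDF into Chroma, then retrieves the top-3 chunks
# and generates; the second return value is the cleared textbox content.
answer, cleared = bot.generate_response(history=[], query="What is this document about?", file=pdf)
print(answer)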