hanzla committed
Commit 8489a3a · 1 Parent(s): 9c18d13
Files changed (2):
  1. src/app.py +0 -5
  2. src/pdfchatbot.py +21 -123
src/app.py CHANGED
@@ -1,19 +1,14 @@
 from interface import create_demo
 from pdfchatbot import PDFChatBot
 
-# Create Gradio interface
 demo, chat_history, show_img, txt, submit_button, uploaded_pdf, slider1 = create_demo()
 
-# Create PDFChatBot instance
 pdf_chatbot = PDFChatBot()
 pdf_chatbot.create_organic_pipeline()
 
-# Set up event handlers
 with demo:
-    # Event handler for uploading a PDF
     uploaded_pdf.upload(pdf_chatbot.render_file, inputs=[uploaded_pdf, slider1], outputs=[show_img])
 
-    # Event handler for submitting text and generating response
     submit_button.click(pdf_chatbot.add_text, inputs=[chat_history, txt], outputs=[chat_history], queue=False).\
         success(pdf_chatbot.generate_response, inputs=[chat_history, txt, uploaded_pdf], outputs=[chat_history, txt]).\
         success(pdf_chatbot.render_file, inputs=[uploaded_pdf], outputs=[show_img])
 
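Note: in the .click(...).success(...) chain above, each .success() step runs only if the previous handler returned without raising, so an empty textbox (which makes add_text raise gr.Error) never reaches the model. A minimal self-contained sketch of the same wiring, with a hypothetical echo_response stub standing in for the real generate_response call:

import gradio as gr

def add_text(history, text):
    # mirrors PDFChatBot.add_text: reject empty input, append a placeholder turn
    if not text:
        raise gr.Error('Enter text')
    history.append((text, ''))
    return history

def echo_response(history, text):
    # hypothetical stand-in for pdf_chatbot.generate_response
    history[-1] = (history[-1][0], f"echo: {text}")
    return history, ""

with gr.Blocks() as demo:
    chat_history = gr.Chatbot()
    txt = gr.Textbox()
    submit_button = gr.Button("Submit")
    # each .success() step fires only if the previous handler raised no error
    submit_button.click(add_text, inputs=[chat_history, txt], outputs=[chat_history], queue=False).\
        success(echo_response, inputs=[chat_history, txt], outputs=[chat_history, txt])

demo.launch()

Passing queue=False on the first step keeps the cheap validation handler out of the request queue; only the generation step needs it.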
src/pdfchatbot.py CHANGED
@@ -40,62 +40,11 @@ class PDFChatBot:
         self.format_seperator="""\n\n--\n\n"""
         #self.chunk_size_slider = chunk_size_slider
 
-    def load_config(self, file_path):
-        """
-        Load configuration from a YAML file.
-
-        Parameters:
-            file_path (str): Path to the YAML configuration file.
-
-        Returns:
-            dict: Configuration as a dictionary.
-        """
-        with open(file_path, 'r') as stream:
-            try:
-                config = yaml.safe_load(stream)
-                return config
-            except yaml.YAMLError as exc:
-                print(f"Error loading configuration: {exc}")
-                return None
-
-    def add_text(self, history, text):
-        """
-        Add user-entered text to the chat history.
-
-        Parameters:
-            history (list): List of chat history tuples.
-            text (str): User-entered text.
-
-        Returns:
-            list: Updated chat history.
-        """
-        if not text:
-            raise gr.Error('Enter text')
-        history.append((text, ''))
-        #self.chunk_size_slider.interactive = False
-        return history
-
-    def create_prompt_template(self):
-        """
-        Create a prompt template for the chatbot.
-        """
-        template = (
-            f"The assistant should provide detailed explanations."
-            "Combine the chat history and follow up question into "
-            "Follow up question: What is this"
-        )
-        self.prompt = PromptTemplate.from_template(template)
-
     def load_embeddings(self):
-        """
-        Load embeddings from Hugging Face and set in the config file.
-        """
-        self.embeddings = HuggingFaceEmbeddings(model_name=self.config.get("modelEmbeddings"))
+        self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+        print("Embedding model loaded")
 
     def load_vectordb(self):
-        """
-        Load the vector database from the documents and embeddings.
-        """
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=256,
             chunk_overlap=100,
@@ -104,49 +53,23 @@ class PDFChatBot:
         )
         docs = text_splitter.split_documents(self.documents)
         self.vectordb = Chroma.from_documents(docs, self.embeddings)
+        print("Vector store created")
-
-    def load_tokenizer(self):
-        """
-        Load the tokenizer from Hugging Face and set in the config file.
-        """
-        self.tokenizer = AutoTokenizer.from_pretrained(self.config.get("autoTokenizer"))
-
-    def load_model(self):
-        """
-        Load the causal language model from Hugging Face and set in the config file.
-        """
-        self.model = AutoModelForCausalLM.from_pretrained(
-            self.config.get("autoModelForCausalLM"),
-            device_map='auto',
-            torch_dtype=torch.float16,
-            token=True,
-        )
-
-    def create_pipeline(self):
-        """
-        Create a pipeline for text generation using the loaded model and tokenizer.
-        """
-        pipe = pipeline(
-            model=self.model,
-            task='text-generation',
-            tokenizer=self.tokenizer,
-            max_new_tokens=1024
-        )
-        self.pipeline = HuggingFacePipeline(pipeline=pipe)
 
     def create_organic_pipeline(self):
         self.pipeline = pipeline(
             "text-generation",
             model="meta-llama/Meta-Llama-3-8B-Instruct",
             tokenizer="meta-llama/Meta-Llama-3-8B-Instruct",
-            model_kwargs={"torch_dtype": torch.bfloat16},
+            model_kwargs={"torch_dtype": torch.float16},
             device="cuda",
         )
+        print("Model pipeline loaded")
 
     def get_organic_context(self, query):
         documents = self.vectordb.similarity_search_with_relevance_scores(query, k=3)
         context = self.format_seperator.join([doc.page_content for doc, score in documents])
         self.current_context = context
+        print("Context Ready")
         print(self.current_context)
 
     def create_organic_response(self, history, query):
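Note: the hunks above fix the retrieval path to: split the loaded PDF into 256-character chunks with 100 characters of overlap, embed them with the now hard-coded sentence-transformers/all-MiniLM-L6-v2 model, index them in Chroma, and pull the top 3 chunks per query. A self-contained sketch of that flow; the import paths (langchain_community) and the sample document are assumptions, since the diff never shows the file's import block:

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# stand-in for PyPDFLoader(file.name).load()
documents = [Document(page_content="Example PDF text. " * 50)]

splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=100)
docs = splitter.split_documents(documents)

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectordb = Chroma.from_documents(docs, embeddings)

# k=3 as in get_organic_context; chunks joined with the "\n\n--\n\n" separator
hits = vectordb.similarity_search_with_relevance_scores("What is this document about?", k=3)
context = "\n\n--\n\n".join(doc.page_content for doc, score in hits)
print(context)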
@@ -175,17 +98,6 @@ class PDFChatBot:
         )
         return outputs[0]["generated_text"][len(prompt):]
 
-    def create_chain(self):
-        """
-        Create a Conversational Retrieval Chain
-        """
-        self.chain = ConversationalRetrievalChain.from_llm(
-            self.pipeline,
-            chain_type="stuff",
-            retriever=self.vectordb.as_retriever(search_kwargs={"k": 1}),
-            condense_question_prompt=self.prompt,
-            return_source_documents=True
-        )
 
     def process_file(self, file):
         """
@@ -194,24 +106,13 @@ class PDFChatBot:
         Parameters:
             file (FileStorage): The uploaded PDF file.
         """
-        self.create_prompt_template()
         self.documents = PyPDFLoader(file.name).load()
         self.load_embeddings()
         self.load_vectordb()
         #self.create_chain()
     @spaces.GPU
     def generate_response(self, history, query, file):
-        """
-        Generate a response based on user query and chat history.
 
-        Parameters:
-            history (list): List of chat history tuples.
-            query (str): User's query.
-            file (FileStorage): The uploaded PDF file.
-
-        Returns:
-            tuple: Updated chat history and a space.
-        """
         if not query:
             raise gr.Error(message='Submit a question')
         if not file:
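Note: create_organic_response itself is unchanged and therefore truncated in this diff, but its visible tail (outputs[0]["generated_text"][len(prompt):]) is the usual transformers text-generation pattern: build a prompt, run the pipeline, slice the echoed prompt off the output. A sketch of that pattern against the same pipeline; the chat-template prompt construction below is an assumption about how the class combines context and query, not code from the diff:

import torch
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    tokenizer="meta-llama/Meta-Llama-3-8B-Instruct",
    model_kwargs={"torch_dtype": torch.float16},  # the commit's bfloat16 -> float16 switch
    device="cuda",
)

# assumed prompt construction: retrieved context plus question via the chat template
messages = [
    {"role": "system", "content": "Answer using only the provided context."},
    {"role": "user", "content": "Context:\n<retrieved chunks>\n\nQuestion: What is this document about?"},
]
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

outputs = pipe(prompt, max_new_tokens=512, do_sample=False)
# as in create_organic_response: drop the echoed prompt, keep only the completion
print(outputs[0]["generated_text"][len(prompt):])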
@@ -220,31 +121,28 @@ class PDFChatBot:
             self.process_file(file)
             self.processed = True
 
-        """result = self.chain({"question": query, 'chat_history': self.chat_history}, return_only_outputs=True)
-        self.chat_history.append((query, result["answer"]))
-        for char in result['answer']:
-            history[-1][-1] += char
-        return history, " """""
-
         result = self.create_organic_response(history="", query=query)
         return result, ""
 
     def render_file(self, file, chunk_size):
-        """
-        Renders a specific page of a PDF file as an image.
-
-        Parameters:
-            file (FileStorage): The PDF file.
-
-        Returns:
-            PIL.Image.Image: The rendered page as an image.
-        """
-        #if self.chunk_size_slider.interactive == False:
-        #    self.chunk_size_slider.interactive = True
         print(chunk_size)
         doc = fitz.open(file.name)
         page = doc[self.page]
         self.chunk_size = chunk_size
         pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72))
         image = Image.frombytes('RGB', [pix.width, pix.height], pix.samples)
-        return image
+        return image
+
+    def add_text(self, history, text):
+        """
+        Add user-entered text to the chat history.
+        Parameters:
+            history (list): List of chat history tuples.
+            text (str): User-entered text.
+        Returns:
+            list: Updated chat history.
+        """
+        if not text:
+            raise gr.Error('Enter text')
+        history.append((text, ''))
+        return history
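Note: render_file keeps the standard PyMuPDF rasterization recipe: open the document, pick a page, scale from 72-point PDF space up to 300 DPI with a fitz.Matrix, and wrap the raw pixel buffer in a PIL image. A standalone sketch (the file path is hypothetical):

import fitz  # PyMuPDF
from PIL import Image

doc = fitz.open("example.pdf")  # hypothetical path
page = doc[0]

# 300/72 scales from PDF points (72 per inch) up to 300 DPI
pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72))
image = Image.frombytes('RGB', [pix.width, pix.height], pix.samples)
image.save("page0.png")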