syedmudassir16 committed
Commit: d17b42f
1 Parent(s): eabeb6d

Update app.py

Files changed (1)
  1. app.py +77 -157
app.py CHANGED
@@ -1,5 +1,3 @@
-"Single Thread"
-
 import os
 import multiprocessing
 import concurrent.futures
@@ -10,18 +8,19 @@ from sentence_transformers import SentenceTransformer
 import faiss
 import torch
 import numpy as np
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
 from datetime import datetime
 import json
 import gradio as gr
-import re
+import re
+from threading import Thread
 
 class DocumentRetrievalAndGeneration:
     def __init__(self, embedding_model_name, lm_model_id, data_folder):
         self.all_splits = self.load_documents(data_folder)
         self.embeddings = SentenceTransformer(embedding_model_name)
         self.gpu_index = self.create_faiss_index()
-        self.llm = self.initialize_llm(lm_model_id)
+        self.tokenizer, self.model = self.initialize_llm(lm_model_id)
 
     def load_documents(self, folder_path):
         loader = DirectoryLoader(folder_path, loader_cls=TextLoader)
@@ -30,7 +29,7 @@ class DocumentRetrievalAndGeneration:
         all_splits = text_splitter.split_documents(documents)
         print('Length of documents:', len(documents))
         print("LEN of all_splits", len(all_splits))
-        for i in range(5):
+        for i in range(3):
             print(all_splits[i].page_content)
         return all_splits
 
@@ -44,122 +43,99 @@ class DocumentRetrievalAndGeneration:
         return gpu_index
 
     def initialize_llm(self, model_id):
-        bnb_config = BitsAndBytesConfig(
+        quantization_config = BitsAndBytesConfig(
             load_in_4bit=True,
             bnb_4bit_use_double_quant=True,
             bnb_4bit_quant_type="nf4",
             bnb_4bit_compute_dtype=torch.bfloat16
         )
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
-        generate_text = pipeline(
-            model=model,
-            tokenizer=tokenizer,
-            return_full_text=True,
-            task='text-generation',
-            temperature=0.6,
-            max_new_tokens=256,
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+            quantization_config=quantization_config
         )
-        return generate_text
+        return tokenizer, model
 
-    def generate_response_with_timeout(self, model_inputs):
+    def generate_response_with_timeout(self, input_ids, max_new_tokens=1000):
         try:
-            with concurrent.futures.ThreadPoolExecutor() as executor:
-                future = executor.submit(self.llm.model.generate, model_inputs, max_new_tokens=1000, do_sample=True)
-                generated_ids = future.result(timeout=60)  # Timeout set to 60 seconds
-                return generated_ids
-        except concurrent.futures.TimeoutError:
-            return "Text generation process timed out"
-            raise TimeoutError("Text generation process timed out")
-
+            streamer = TextIteratorStreamer(self.tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
+            generate_kwargs = dict(
+                input_ids=input_ids,
+                max_new_tokens=max_new_tokens,
+                do_sample=True,
+                top_p=1.0,
+                top_k=20,
+                temperature=0.8,
+                repetition_penalty=1.2,
+                eos_token_id=[128001, 128008, 128009],
+                streamer=streamer,
+            )
+
+            thread = Thread(target=self.model.generate, kwargs=generate_kwargs)
+            thread.start()
+
+            generated_text = ""
+            for new_text in streamer:
+                generated_text += new_text
+
+            return generated_text
+        except Exception as e:
+            print(f"Error in generate_response_with_timeout: {str(e)}")
+            return "Text generation process encountered an error"
+
     def query_and_generate_response(self, query):
+        similarityThreshold = 1
         query_embedding = self.embeddings.encode(query, convert_to_tensor=True).cpu().numpy()
-        distances, indices = self.gpu_index.search(np.array([query_embedding]), k=5)
-
+        distances, indices = self.gpu_index.search(np.array([query_embedding]), k=3)
+        print("Distance", distances, "indices", indices)
         content = ""
+        filtered_results = []
         for idx, distance in zip(indices[0], distances[0]):
+            if distance <= similarityThreshold:
+                filtered_results.append(idx)
+        for i in filtered_results:
+            print(self.all_splits[i].page_content)
             content += "-" * 50 + "\n"
             content += self.all_splits[idx].page_content + "\n"
             print("CHUNK", idx)
             print("Distance:", distance)
+            print("indices:", indices)
             print(self.all_splits[idx].page_content)
             print("############################")
-        # for idx in indices[0]:
-        #     content += "-" * 50 + "\n"
-        #     content += self.all_splits[idx].page_content + "\n"
-        #     distance=distances[0][i]
-        #     print("CHUNK", idx)
-        #     print("Distance :",distance)
-        #     print(self.all_splits[idx].page_content)
-        #     print("############################")
-        prompt = f"""<s>
-        You are a knowledgeable assistant with access to a comprehensive database.
-        I need you to answer my question and provide related information in a specific format.
-        I have provided five relatable json files {content}, choose the most suitable chunks for answering the query
-        RETURN ONLY SOLUTION . IF THEIR IS NO ANSWER RELATABLE IN RETRIEVED CHUNKS , RETURN " NO SOLUTION AVAILABLE"
-        Here's what I need:
-        Include a final answer without additional comments, sign-offs, or extra phrases. Be direct and to the point.
-        content
-
-        Here's my question:
-        Query:{query}
-        Solution==>
-
-
-        Example
-        Query: "How to use IPU1_0 instead of A15_0 to process NDK in TDA2x-EVM",
-        Solution: "To use IPU1_0 instead of A15_0 to process NDK in TDA2x-EVM, you need to modify the configuration file of the NDK application. Specifically, change the processor reference from 'A15_0' to 'IPU1_0'.",
-
-        Example
-        Query: "Can BQ25896 support I2C interface?",
-        Solution: "Yes, the BQ25896 charger supports the I2C interface for communication."
-
-        Example
-        Query: "Who is the fastest runner in the world",
-        Solution:"NO SOLUTION AVAILABLE"
-
-        Example
-        Query:"What is the price of latest apple MACBOOK "
-        Solution:"NO SOLUTION AVAILABLE"
-
-        RETURN ONLY SOLUTION . IF THEIR IS NO ANSWER RELATABLE IN RETRIEVED CHUNKS , RETURN " NO SOLUTION AVAILABLE"
-        IF THE QUERY AND THE RETRIEVED CHUNKS DO NOT CORRELATE MEANINGFULLY, OR IF THE QUERY IS NOT RELEVANT TO TDA2 OR RELATED TOPICS, THEN "NO SOLUTION AVAILABLE."
-        </s>
-        """
-        # prompt = f"Query: {query}\nSolution: {content}\n"
 
-        # Encode and prepare inputs
-        messages = [{"role": "user", "content": prompt}]
-        encodeds = self.llm.tokenizer.apply_chat_template(messages, return_tensors="pt")
-        model_inputs = encodeds.to(self.llm.device)
+        conversation = [
+            {"role": "system", "content": "You are a knowledgeable assistant with access to a comprehensive database."},
+            {"role": "user", "content": f"""
+            I need you to answer my question and provide related information in a specific format.
+            I have provided five relatable json files {content}, choose the most suitable chunks for answering the query.
+            RETURN ONLY SOLUTION. IF THERE IS NO ANSWER RELATABLE IN RETRIEVED CHUNKS, RETURN "NO SOLUTION AVAILABLE".
+            Include a final answer without additional comments, sign-offs, or extra phrases. Be direct and to the point.
+
+            Here's my question:
+            Query: {query}
+            Solution==>
+            """}
+        ]
 
-        # Perform inference and measure time
+        input_ids = self.tokenizer.apply_chat_template(conversation, return_tensors="pt").to(self.model.device)
+
         start_time = datetime.now()
-        generated_ids = self.generate_response_with_timeout(model_inputs)
-        # generated_ids = self.llm.model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
+        generated_response = self.generate_response_with_timeout(input_ids)
         elapsed_time = datetime.now() - start_time
 
-        # Decode and return output
-        decoded = self.llm.tokenizer.batch_decode(generated_ids)
-        generated_response = decoded[0]
-        match1 = re.search(r'\[/INST\](.*?)</s>', generated_response, re.DOTALL)
-
-        match2 = re.search(r'Solution:(.*?)</s>', generated_response, re.DOTALL | re.IGNORECASE)
-        if match1:
-            solution_text = match1.group(1).strip()
-            print(solution_text)
-            if "Solution:" in solution_text:
-                solution_text = solution_text.split("Solution:", 1)[1].strip()
-        elif match2:
-            solution_text = match2.group(1).strip()
-            print(solution_text)
-
-        else:
-            solution_text=generated_response
         print("Generated response:", generated_response)
         print("Time elapsed:", elapsed_time)
-        print("Device in use:", self.llm.device)
+        print("Device in use:", self.model.device)
+
+        solution_text = generated_response.strip()
+        if "Solution:" in solution_text:
+            solution_text = solution_text.split("Solution:", 1)[1].strip()
+
+        # Post-processing to remove "assistant" prefix
+        solution_text = re.sub(r'^assistant\s*', '', solution_text, flags=re.IGNORECASE)
+        solution_text = solution_text.strip()
 
         return solution_text, content
 
@@ -168,29 +144,25 @@ class DocumentRetrievalAndGeneration:
         return response
 
 if __name__ == "__main__":
-    # Example usage
     embedding_model_name = 'flax-sentence-embeddings/all_datasets_v3_MiniLM-L12'
-    lm_model_id = "mistralai/Mistral-7B-Instruct-v0.2"
+    lm_model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
     data_folder = 'sample_embedding_folder2'
 
     doc_retrieval_gen = DocumentRetrievalAndGeneration(embedding_model_name, lm_model_id, data_folder)
 
-    """Dual Interface"""
-
    def launch_interface():
        css_code = """
            .gradio-container {
                background-color: #daccdb;
            }
-            /* Button styling for all buttons */
            button {
-                background-color: #927fc7; /* Default color for all other buttons */
+                background-color: #927fc7;
                color: black;
                border: 1px solid black;
                padding: 10px;
                margin-right: 10px;
-                font-size: 16px; /* Increase font size */
-                font-weight: bold; /* Make text bold */
+                font-size: 16px;
+                font-weight: bold;
            }
        """
        EXAMPLES = [
@@ -201,20 +173,18 @@ if __name__ == "__main__":
 
        file_path = "ticketNames.txt"
 
-        # Read the file content
        with open(file_path, "r") as file:
            content = file.read()
        ticket_names = json.loads(content)
        dropdown = gr.Dropdown(label="Sample queries", choices=ticket_names)
 
-        # Define Gradio interfaces
        tab1 = gr.Interface(
            fn=doc_retrieval_gen.qa_infer_gradio,
            inputs=[gr.Textbox(label="QUERY", placeholder="Enter your query here")],
            allow_flagging='never',
            examples=EXAMPLES,
            cache_examples=False,
-            outputs=[gr.Textbox(label="SOLUTION"), gr.Textbox(label="RELATED QUERIES")],
+            outputs=[gr.Textbox(label="RESPONSE"), gr.Textbox(label="RELATED QUERIES")],
            css=css_code
        )
        tab2 = gr.Interface(
@@ -225,7 +195,6 @@ if __name__ == "__main__":
            css=css_code
        )
 
-        # Combine interfaces into a tabbed interface
        gr.TabbedInterface(
            [tab1, tab2],
            ["Textbox Input", "FAQs"],
@@ -233,53 +202,4 @@ if __name__ == "__main__":
            css=css_code
        ).launch(debug=True)
 
-    # Launch the interface
-    launch_interface()
-
-
-
-    """Single Interface"""
-    # def launch_interface():
-    #     css_code = """
-    #         .gradio-container {
-    #             background-color: #daccdb;
-    #         }
-    #         /* Button styling for all buttons */
-    #         button {
-    #             background-color: #927fc7; /* Default color for all other buttons */
-    #             color: black;
-    #             border: 1px solid black;
-    #             padding: 10px;
-    #             margin-right: 10px;
-    #             font-size: 16px; /* Increase font size */
-    #             font-weight: bold; /* Make text bold */
-    #         }
-    #     """
-    #     EXAMPLES = ["On which devices can the VIP and CSI2 modules operate simultaneously? ",
-    #                 "I'm using Code Composer Studio 5.4.0.00091 and enabled FPv4SPD16 floating point support for CortexM4 in TDA2. However, after building the project, the .asm file shows --float_support=vfplib instead of FPv4SPD16. Why is this happening?",
-    #                 "Could you clarify the maximum number of cameras that can be connected simultaneously to the video input ports on the TDA2x SoC, considering it supports up to 10 multiplexed input ports and includes 3 dedicated video input modules?"]
-
-    #     file_path = "ticketNames.txt"
-
-    #     # Read the file content
-    #     with open(file_path, "r") as file:
-    #         content = file.read()
-    #     ticket_names = json.loads(content)
-    #     dropdown = gr.Dropdown(label="Sample queries", choices=ticket_names)
-
-    #     # Define Gradio interface
-    #     interface = gr.Interface(
-    #         fn=doc_retrieval_gen.qa_infer_gradio,
-    #         inputs=[gr.Textbox(label="QUERY", placeholder="Enter your query here")],
-    #         allow_flagging='never',
-    #         examples=EXAMPLES,
-    #         cache_examples=False,
-    #         outputs=[gr.Textbox(label="SOLUTION"), gr.Textbox(label="RELATED QUERIES")],
-    #         css=css_code
-    #     )
-
-    #     # Launch Gradio interface
-    #     interface.launch(debug=True)
-
-    #     # Launch the interface
-    #     launch_interface()
+    launch_interface()
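
For reference, a minimal, self-contained sketch of the TextIteratorStreamer pattern that the new generate_response_with_timeout relies on. The query string is illustrative (taken from the removed prompt examples), the model id matches the one configured in app.py, and the generation arguments are simplified; this is not the full application flow.

from threading import Thread

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # same model id as configured in app.py
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")

# Build chat-formatted input ids, as query_and_generate_response does.
conversation = [{"role": "user", "content": "Can BQ25896 support I2C interface?"}]  # illustrative query
input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)

# generate() runs on a worker thread; the streamer yields decoded text pieces as they are
# produced and raises queue.Empty if no new token arrives within the 60 s timeout.
streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
thread = Thread(target=model.generate, kwargs=dict(input_ids=input_ids, max_new_tokens=256, streamer=streamer))
thread.start()

generated_text = "".join(piece for piece in streamer)
thread.join()
print(generated_text)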