KingNish committed (verified)
Commit 1339cff · Parent: cdf9da1

test update

Files changed (1)
chatbot.py  +33 −17
chatbot.py CHANGED
@@ -198,8 +198,6 @@ client_mixtral = InferenceClient("NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO")
 client_mistral = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 generate_kwargs = dict( max_new_tokens=4000, do_sample=True, stream=True, details=True, return_full_text=False )
 
-system_llava = "<|im_start|>system\nYou are OpenGPT 4o, an exceptionally capable and versatile AI assistant made by KingNish. Your task is to fulfill users query in best possible way. You are provided with image, videos and 3d structures as input with question your task is to give best possible detailed results to user according to their query. Reply the question asked by user properly and best possible way.<|im_end|>"
-
 @spaces.GPU(duration=60, queue=False)
 def model_inference( user_prompt, chat_history, web_search):
     if not user_prompt["files"]:
@@ -244,28 +242,46 @@ def model_inference( user_prompt, chat_history, web_search):
             output += response.token.text
             yield output
     else:
-        image = user_prompt["files"][-1]
-        txt = user_prompt["text"]
-        img = user_prompt["files"]
-        ext_buffer = f"'user\ntext': '{txt}', 'files': '{img}' assistant"
+        message = user_prompt
+        if len(message.files) == 1:
+            image = [message.files[0].path]
+        elif len(message.files) > 1:
+            image = [msg.path for msg in message.files]
+
+        txt = message.text
 
         video_extensions = ("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg", "wav", "gif", "webm", "m4v", "3gp")
         image_extensions = Image.registered_extensions()
         image_extensions = tuple([ex for ex, f in image_extensions.items()])
 
-        if image.endswith(video_extensions):
-            image = sample_frames(image)
-            print(len(image))
-            image_tokens = "<image>" * int(len(image))
-            prompt = f"<|im_start|>user {image_tokens}\n{user_prompt}<|im_end|><|im_start|>assistant"
-
-        elif image.endswith(image_extensions):
-            image = Image.open(image).convert("RGB")
-            prompt = f"<|im_start|>user <image>\n{user_prompt}<|im_end|><|im_start|>assistant"
-
-        final_prompt = f"{system_llava}\n{prompt}"
+        if len(image) == 1:
+            if image[0].endswith(video_extensions):
+                image = sample_frames(image[0])
+                print(len(image))
+                image_tokens = "<image>" * int(len(image))
+                prompt = f"<|im_start|>user {image_tokens}\n{message.text}<|im_end|><|im_start|>assistant"
+            elif image[0].endswith(image_extensions):
+                image = Image.open(image[0]).convert("RGB")
+                prompt = f"<|im_start|>user <image>\n{message.text}<|im_end|><|im_start|>assistant"
+
+        elif len(image) > 1:
+            image_list = []
+            for img in image:
+                if img.endswith(image_extensions):
+                    img = Image.open(img).convert("RGB")
+                    image_list.append(img)
+                elif img.endswith(video_extensions):
+                    frames = sample_frames(img)
+                    for frame in frames:
+                        image_list.append(frame)
+
+            toks = "<image>" * len(image_list)
+            prompt = f"<|im_start|>user {toks}\n{message.text}<|im_end|><|im_start|>assistant"
+            image = image_list
+
+        prompt = f"<|im_start|>system\nYou are OpenGPT 4o, an exceptionally capable and versatile AI assistant made by KingNish. Your task is to fulfill users query in best possible way. You are provided with image, videos and 3d structures as input with question your task is to give best possible detailed results to user according to their query. Reply the question asked by user properly and best possible way.<|im_end|>\n{prompt}"
         inputs = processor(prompt, image, return_tensors="pt").to("cuda", torch.float16)
         streamer = TextIteratorStreamer(processor, skip_prompt=True, **{"skip_special_tokens": True})
         generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=2048)
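
For context, here is a minimal sketch of the input shape the new branching appears to assume: a multimodal message object exposing `.text` and `.files`, where each file carries a local `.path`. The `FileData` and `MultimodalMessage` classes below are illustrative stand-ins, not Gradio's actual types.

```python
from dataclasses import dataclass

@dataclass
class FileData:
    """Illustrative stand-in for the file wrapper attached to a chat message."""
    path: str

@dataclass
class MultimodalMessage:
    """Illustrative stand-in for the multimodal chat message object."""
    text: str
    files: list

msg = MultimodalMessage(
    text="Describe what happens across these inputs.",
    files=[FileData("photo.png"), FileData("clip.mp4")],
)

# Mirrors the commit's branching: a single file becomes a one-element
# path list; several files become the list of all their paths.
if len(msg.files) == 1:
    image = [msg.files[0].path]
elif len(msg.files) > 1:
    image = [f.path for f in msg.files]

print(image)  # ['photo.png', 'clip.mp4']
```

Downstream, paths ending in a video extension are expanded into sampled frames while image paths are opened as RGB `PIL.Image` objects, so `image` always reaches the processor as either one image or a flat list of images, with one `<image>` token per entry in the prompt.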