prithivMLmods committed
Commit bd29f11 · verified · 1 Parent(s): bd5bebf

Update app.py

Files changed (1): app.py +7 -6
app.py CHANGED
@@ -59,7 +59,6 @@ def clean_chat_history(chat_history):
     return cleaned
 
 # Environment variables and parameters for Stable Diffusion XL
-# Use : SG161222/RealVisXL_V4.0_Lightning or SG161222/RealVisXL_V5.0_Lightning
 MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
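For context, a minimal sketch of how these variables would typically feed the SDXL setup, assuming the app uses diffusers' StableDiffusionXLPipeline; the loading code is not part of this diff, and the RealVisXL fallback below is only an assumption taken from the comment this commit deletes:

    import os
    import torch
    from diffusers import StableDiffusionXLPipeline

    # MODEL_VAL_PATH comes from the environment; the fallback repo id is an
    # assumption echoing the deleted comment, not something this diff confirms.
    MODEL_ID_SD = os.getenv("MODEL_VAL_PATH", "SG161222/RealVisXL_V5.0_Lightning")
    USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"

    pipe = StableDiffusionXLPipeline.from_pretrained(
        MODEL_ID_SD, torch_dtype=torch.float16
    ).to("cuda" if torch.cuda.is_available() else "cpu")

    if USE_TORCH_COMPILE:
        # torch.compile trades one-time warmup cost for faster repeated UNet calls.
        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)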
@@ -205,7 +204,7 @@ def generate(
     Generates chatbot responses with support for multimodal input and image generation.
     Special commands:
     - "@image": triggers image generation using the SDXL pipeline.
-    - "@video-infer": triggers video processing using Qwen2VL.
+    - "@qwen2vl-video": triggers video processing using Qwen2VL.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
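The renamed command implies prefix-based routing near the top of generate(). A hedged sketch of that dispatch, with hypothetical helper names (generate_image, process_video), since the diff shows only the docstring and not the routing code itself:

    def generate(input_dict: dict, chat_history: list):
        text = input_dict["text"]
        files = input_dict.get("files", [])
        lowered = text.strip().lower()
        if lowered.startswith("@image"):
            # hypothetical helper wrapping the SDXL pipeline
            yield from generate_image(text[len("@image"):].strip())
            return
        if lowered.startswith("@qwen2vl-video"):
            # hypothetical helper wrapping Qwen2VL video inference
            yield from process_video(text[len("@qwen2vl-video"):].strip(), files)
            return
        # otherwise fall through to the regular text/vision chat path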
@@ -280,7 +279,11 @@ def generate(
         yield buffer
         return
 
-    # Process as text and/or image input.
+    # For regular chat (text and multimodal input), process the conversation.
+    text = text.strip()
+    conversation = clean_chat_history(chat_history)
+    conversation.append({"role": "user", "content": text})
+
     if files:
         if len(files) > 1:
             images = [load_image(image) for image in files]
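This hunk hoists the conversation construction out of the text-only else branch (see the matching deletion in the next hunk), so the file and text paths now start from the same cleaned history. A sketch of what clean_chat_history plausibly does, inferred only from its name and the `return cleaned` tail in the first hunk; the real body is not shown in this diff:

    def clean_chat_history(chat_history):
        """Keep only plain-text turns so the chat template never receives
        file tuples or image objects from earlier multimodal messages."""
        cleaned = []
        for message in chat_history:
            if isinstance(message, dict) and isinstance(message.get("content"), str):
                cleaned.append(message)
        return cleaned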
@@ -309,8 +312,6 @@ def generate(
             time.sleep(0.01)
             yield buffer
     else:
-        conversation = clean_chat_history(chat_history)
-        conversation.append({"role": "user", "content": text})
         input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
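The slice `input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]` truncates from the left, discarding the oldest tokens and keeping the most recent turns. A self-contained toy illustration of that clamp:

    import torch

    MAX_INPUT_TOKEN_LENGTH = 4                 # toy value; app.py reads it from env
    input_ids = torch.arange(10).unsqueeze(0)  # shape (1, 10), stands in for tokens
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]  # keep the newest tokens
    print(input_ids)  # tensor([[6, 7, 8, 9]])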
@@ -358,7 +359,7 @@ demo = gr.ChatInterface(
     ],
     cache_examples=False,
     type="messages",
-    description="# **Llama Edge** \n`@video-infer 'prompt..', @image`",
+    description="# **QwQ Edge @video-infer 'prompt..', @image**",
     fill_height=True,
     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder="‎ @image for image gen, @video-infer for video, default [text, vision]"),
     stop_btn="Stop Generation",
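For reference, a minimal sketch of the surrounding gr.ChatInterface wiring under the assumption that generate is the streaming handler; the examples list and placeholder are elided, and parameter names match the hunk above:

    import gradio as gr

    demo = gr.ChatInterface(
        fn=generate,  # the streaming handler patched in this commit
        type="messages",
        description="# **QwQ Edge @video-infer 'prompt..', @image**",
        fill_height=True,
        textbox=gr.MultimodalTextbox(
            label="Query Input",
            file_types=["image", "video"],
            file_count="multiple",
        ),
        stop_btn="Stop Generation",
    )

    if __name__ == "__main__":
        demo.queue().launch()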
 