prithivMLmods committed
Commit bd29f11 · verified · 1 Parent(s): bd5bebf

Update app.py

Files changed (1): app.py +7 -6
app.py CHANGED
@@ -59,7 +59,6 @@ def clean_chat_history(chat_history):
     return cleaned
 
 # Environment variables and parameters for Stable Diffusion XL
-# Use : SG161222/RealVisXL_V4.0_Lightning or SG161222/RealVisXL_V5.0_Lightning
 MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
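For context, a minimal sketch of how these variables would typically feed the SDXL setup, assuming the app uses diffusers' StableDiffusionXLPipeline; the loading code is not part of this diff, and the RealVisXL fallback below is only an assumption taken from the comment this commit deletes:

    import os
    import torch
    from diffusers import StableDiffusionXLPipeline

    # MODEL_VAL_PATH comes from the environment; the fallback repo id is an
    # assumption echoing the deleted comment, not something this diff confirms.
    MODEL_ID_SD = os.getenv("MODEL_VAL_PATH", "SG161222/RealVisXL_V5.0_Lightning")
    USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"

    pipe = StableDiffusionXLPipeline.from_pretrained(
        MODEL_ID_SD, torch_dtype=torch.float16
    ).to("cuda" if torch.cuda.is_available() else "cpu")

    if USE_TORCH_COMPILE:
        # torch.compile trades one-time warmup cost for faster repeated UNet calls.
        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)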
@@ -205,7 +204,7 @@ def generate(
     Generates chatbot responses with support for multimodal input and image generation.
     Special commands:
     - "@image": triggers image generation using the SDXL pipeline.
-    - "@video-infer": triggers video processing using Qwen2VL.
+    - "@qwen2vl-video": triggers video processing using Qwen2VL.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
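The renamed command implies prefix-based routing near the top of generate(). A hedged sketch of that dispatch, with hypothetical helper names (generate_image, process_video), since the diff shows only the docstring and not the routing code itself:

    def generate(input_dict: dict, chat_history: list):
        text = input_dict["text"]
        files = input_dict.get("files", [])
        lowered = text.strip().lower()
        if lowered.startswith("@image"):
            # hypothetical helper wrapping the SDXL pipeline
            yield from generate_image(text[len("@image"):].strip())
            return
        if lowered.startswith("@qwen2vl-video"):
            # hypothetical helper wrapping Qwen2VL video inference
            yield from process_video(text[len("@qwen2vl-video"):].strip(), files)
            return
        # otherwise fall through to the regular text/vision chat path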
@@ -280,7 +279,11 @@ def generate(
         yield buffer
         return
 
-    # Process as text and/or image input.
+    # For regular chat (text and multimodal input), process the conversation.
+    text = text.strip()
+    conversation = clean_chat_history(chat_history)
+    conversation.append({"role": "user", "content": text})
+
     if files:
         if len(files) > 1:
             images = [load_image(image) for image in files]
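This hunk hoists the conversation construction out of the text-only else branch (see the matching deletion in the next hunk), so the file and text paths now start from the same cleaned history. A sketch of what clean_chat_history plausibly does, inferred only from its name and the `return cleaned` tail in the first hunk; the real body is not shown in this diff:

    def clean_chat_history(chat_history):
        """Keep only plain-text turns so the chat template never receives
        file tuples or image objects from earlier multimodal messages."""
        cleaned = []
        for message in chat_history:
            if isinstance(message, dict) and isinstance(message.get("content"), str):
                cleaned.append(message)
        return cleaned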
@@ -309,8 +312,6 @@ def generate(
             time.sleep(0.01)
             yield buffer
     else:
-        conversation = clean_chat_history(chat_history)
-        conversation.append({"role": "user", "content": text})
         input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
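The slice `input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]` truncates from the left, discarding the oldest tokens and keeping the most recent turns. A self-contained toy illustration of that clamp:

    import torch

    MAX_INPUT_TOKEN_LENGTH = 4                 # toy value; app.py reads it from env
    input_ids = torch.arange(10).unsqueeze(0)  # shape (1, 10), stands in for tokens
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]  # keep the newest tokens
    print(input_ids)  # tensor([[6, 7, 8, 9]])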
@@ -358,7 +359,7 @@ demo = gr.ChatInterface(
     ],
     cache_examples=False,
     type="messages",
-    description="# **Llama Edge** \n`@video-infer 'prompt..', @image`",
+    description="# **QwQ Edge @video-infer 'prompt..', @image**",
     fill_height=True,
     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder="‎ @image for image gen, @video-infer for video, default [text, vision]"),
     stop_btn="Stop Generation",
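For reference, a minimal sketch of the surrounding gr.ChatInterface wiring under the assumption that generate is the streaming handler; the examples list and placeholder are elided, and parameter names match the hunk above:

    import gradio as gr

    demo = gr.ChatInterface(
        fn=generate,  # the streaming handler patched in this commit
        type="messages",
        description="# **QwQ Edge @video-infer 'prompt..', @image**",
        fill_height=True,
        textbox=gr.MultimodalTextbox(
            label="Query Input",
            file_types=["image", "video"],
            file_count="multiple",
        ),
        stop_btn="Stop Generation",
    )

    if __name__ == "__main__":
        demo.queue().launch()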
 