prithivMLmods committed
Commit 40993df · verified · 1 Parent(s): fff10dd

Update app.py

Files changed (1): app.py (+14, −100)
app.py CHANGED
@@ -21,7 +21,6 @@ from transformers import (
     AutoProcessor,
 )
 from transformers.image_utils import load_image
-from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 
 # Additional imports for new TTS
 from snac import SNAC
@@ -43,7 +42,7 @@ hermes_llm_model = AutoModelForCausalLM.from_pretrained(
 )
 hermes_llm_model.eval()
 
-# Load Qwen2-VL processor and model for multimodal tasks
+# Load Qwen2-VL processor and model for multimodal tasks (e.g. video processing)
 MODEL_ID_QWEN = "prithivMLmods/Qwen2-VL-OCR2-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID_QWEN, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -84,32 +83,12 @@ orpheus_tts_model.to(tts_device)
 orpheus_tts_tokenizer = AutoTokenizer.from_pretrained(tts_model_name)
 print(f"Orpheus TTS model loaded to {tts_device}")
 
-# Some global parameters for chat and image generation
+# Some global parameters for chat responses
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
-# Stable Diffusion XL setup
-MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # e.g. SG161222/RealVisXL_V5.0_Lightning
-MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
-USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
-ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))
-
-sd_pipe = StableDiffusionXLPipeline.from_pretrained(
-    MODEL_ID_SD,
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    use_safetensors=True,
-    add_watermarker=False,
-).to(device)
-sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
-
-if torch.cuda.is_available():
-    sd_pipe.text_encoder = sd_pipe.text_encoder.half()
-if USE_TORCH_COMPILE:
-    sd_pipe.compile()
-if ENABLE_CPU_OFFLOAD:
-    sd_pipe.enable_model_cpu_offload()
+# (Image generation related code has been fully removed.)
 
 MAX_SEED = np.iinfo(np.int32).max
 
@@ -164,50 +143,6 @@ def clean_chat_history(chat_history):
             cleaned.append(msg)
     return cleaned
 
-@spaces.GPU(duration=60, enable_queue=True)
-def generate_image_fn(
-    prompt: str,
-    negative_prompt: str = "",
-    use_negative_prompt: bool = False,
-    seed: int = 1,
-    width: int = 1024,
-    height: int = 1024,
-    guidance_scale: float = 3,
-    num_inference_steps: int = 25,
-    randomize_seed: bool = False,
-    use_resolution_binning: bool = True,
-    num_images: int = 1,
-    progress=gr.Progress(track_tqdm=True),
-):
-    seed = int(randomize_seed_fn(seed, randomize_seed))
-    generator = torch.Generator(device=device).manual_seed(seed)
-    options = {
-        "prompt": [prompt] * num_images,
-        "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None,
-        "width": width,
-        "height": height,
-        "guidance_scale": guidance_scale,
-        "num_inference_steps": num_inference_steps,
-        "generator": generator,
-        "output_type": "pil",
-    }
-    if use_resolution_binning:
-        options["use_resolution_binning"] = True
-    images = []
-    for i in range(0, num_images, BATCH_SIZE):
-        batch_options = options.copy()
-        batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
-        if "negative_prompt" in batch_options and batch_options["negative_prompt"] is not None:
-            batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
-        if device.type == "cuda":
-            with torch.autocast("cuda", dtype=torch.float16):
-                outputs = sd_pipe(**batch_options)
-        else:
-            outputs = sd_pipe(**batch_options)
-        images.extend(outputs.images)
-    image_paths = [save_image(img) for img in images]
-    return image_paths, seed
-
 # New TTS functions (SNAC/Orpheus pipeline)
 def process_prompt(prompt, voice, tokenizer, device):
     prompt = f"{voice}: {prompt}"
@@ -298,11 +233,10 @@ def generate(
     repetition_penalty: float = 1.2,
 ):
     """
-    Generates chatbot responses with support for multimodal input, image generation,
+    Generates chatbot responses with support for multimodal input, video processing,
     TTS, and LLM-augmented TTS.
 
     Trigger commands:
-    - "@image": generate an image.
     - "@video-infer": process video.
     - "@<voice>-tts": directly convert text to speech.
     - "@<voice>-llm": infer with the DeepHermes Llama model then convert to speech.
@@ -311,26 +245,6 @@ def generate(
     files = input_dict.get("files", [])
     lower_text = text.strip().lower()
 
-    # Branch for image generation.
-    if lower_text.startswith("@image"):
-        prompt = text[len("@image"):].strip()
-        yield progress_bar_html("Generating Image")
-        image_paths, used_seed = generate_image_fn(
-            prompt=prompt,
-            negative_prompt="",
-            use_negative_prompt=False,
-            seed=1,
-            width=1024,
-            height=1024,
-            guidance_scale=3,
-            num_inference_steps=25,
-            randomize_seed=True,
-            use_resolution_binning=True,
-            num_images=1,
-        )
-        yield gr.Image(image_paths[0])
-        return
-
     # Branch for video processing.
     if lower_text.startswith("@video-infer"):
         prompt = text[len("@video-infer"):].strip()
@@ -424,28 +338,30 @@ def generate(
     # Default branch for regular chat (text and multimodal without TTS).
     conversation = clean_chat_history(chat_history)
     conversation.append({"role": "user", "content": text})
+    # If files are provided, only non-image files (e.g. video) are processed via Qwen2VL.
     if files:
+        # Process files using the processor (this branch no longer handles image generation)
         if len(files) > 1:
-            images = [load_image(image) for image in files]
+            inputs_list = [load_image(image) for image in files]
         elif len(files) == 1:
-            images = [load_image(files[0])]
+            inputs_list = [load_image(files[0])]
        else:
-            images = []
+            inputs_list = []
         messages = [{
             "role": "user",
             "content": [
-                *[{"type": "image", "image": image} for image in images],
+                *[{"type": "image", "image": img} for img in inputs_list],
                 {"type": "text", "text": text},
             ]
         }]
         prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = processor(text=[prompt_full], images=images, return_tensors="pt", padding=True).to("cuda")
+        inputs = processor(text=[prompt_full], images=inputs_list, return_tensors="pt", padding=True).to("cuda")
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
         thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
         thread.start()
         buffer = ""
-        yield progress_bar_html("Processing Qwen2VL")
+        yield progress_bar_html("Processing with Qwen2VL")
         for new_text in streamer:
             buffer += new_text.replace("<|im_end|>", "")
             time.sleep(0.01)
@@ -496,16 +412,14 @@ demo = gr.ChatInterface(
         ["@dan-tts Yo, I’m Dan, [groan] and yes, I can even sound annoyed if I have to."],
         ["Write python program for array rotation"],
         ["@tara-tts Hey there, my name is Tara, [laugh] and I’m a speech generation model that can sound just like you!"],
-        [{"text": "summarize the letter", "files": ["examples/1.png"]}],
         ["@tara-llm Who is Nikola Tesla, and why did he die?"],
         ["@emma-llm Explain the causes of rainbows"],
-        ["@image Chocolate dripping from a donut"],
         [{"text": "@video-infer Summarize the event in video", "files": ["examples/sky.mp4"]}],
         [{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}],
     ],
     cache_examples=False,
     type="messages",
-    description="# **Orpheus Edge🧤** `voice: tara, dan, emma, josh` \n `emotion: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>. Use @video-infer, @image, orpheus: @<voice>-tts, or @<voice>-llm triggers llm response`",
+    description="# **Orpheus Edge🧤** `voice: tara, dan, emma, josh` \n `emotion: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>. Use @video-infer, orpheus: @<voice>-tts, or @<voice>-llm triggers llm response`",
     fill_height=True,
     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder="‎ Use @tara-tts/@dan-tts for direct TTS or @tara-llm/@dan-llm for LLM+TTS, etc."),
     stop_btn="Stop Generation",
@@ -513,4 +427,4 @@ demo = gr.ChatInterface(
 )
 
 if __name__ == "__main__":
-    demo.queue(max_size=20).launch(share=True)
+    demo.queue(max_size=30).launch(share=True)
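
The trigger commands kept in generate() after this change ("@video-infer", "@<voice>-tts", "@<voice>-llm") are dispatched by prefix matching on the lowercased user text, as the docstring and branch code in the diff show. Below is a minimal, self-contained sketch of that routing; route_query, the returned tags, and the VOICES tuple are hypothetical illustrations, not code from app.py.

# Hypothetical sketch of the prefix-based routing in generate(); names are illustrative only.
VOICES = ("tara", "dan", "emma", "josh")

def route_query(text: str):
    lower_text = text.strip().lower()
    # "@video-infer <prompt>" -> Qwen2-VL video branch
    if lower_text.startswith("@video-infer"):
        return ("video-infer", text[len("@video-infer"):].strip())
    for voice in VOICES:
        # "@<voice>-tts <text>" -> direct Orpheus/SNAC text-to-speech
        if lower_text.startswith(f"@{voice}-tts"):
            return ("tts", voice, text[len(f"@{voice}-tts"):].strip())
        # "@<voice>-llm <prompt>" -> DeepHermes response first, then TTS
        if lower_text.startswith(f"@{voice}-llm"):
            return ("llm-tts", voice, text[len(f"@{voice}-llm"):].strip())
    # Anything else falls through to the regular chat / multimodal branch.
    return ("chat", text)

For example, route_query("@tara-llm Who is Nikola Tesla?") would return ("llm-tts", "tara", "Who is Nikola Tesla?"), matching the behavior described by the updated docstring.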