prithivMLmods committed on
Commit 6344f7f · verified · 1 Parent(s): 803d002

Update app.py

Files changed (1)
  1. app.py +87 -25
app.py CHANGED
@@ -19,15 +19,15 @@ from transformers import (
     TextIteratorStreamer,
     Qwen2VLForConditionalGeneration,
     AutoProcessor,
+    AutoModelForImageTextToText,
 )
 from transformers.image_utils import load_image
 from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 
-
+# Application description and CSS
 DESCRIPTION = """
 # QwQ Edge 💬
 """
-
 css = '''
 h1 {
   text-align: center;
@@ -48,7 +48,9 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-# Load text-only model and tokenizer
+# -------------------------
+# Load Text-only Model
+# -------------------------
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
@@ -58,19 +60,14 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 model.eval()
 
+# -------------------------
+# TTS Settings
+# -------------------------
 TTS_VOICES = [
     "en-US-JennyNeural",  # @tts1
     "en-US-GuyNeural",    # @tts2
 ]
 
-MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
-processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
-model_m = Qwen2VLForConditionalGeneration.from_pretrained(
-    MODEL_ID,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to("cuda").eval()
-
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     """Convert text to speech using Edge TTS and save as MP3"""
     communicate = edge_tts.Communicate(text, voice)
@@ -88,14 +85,36 @@ def clean_chat_history(chat_history):
         cleaned.append(msg)
     return cleaned
 
-# Environment variables and parameters for Stable Diffusion XL
+# -------------------------
+# Load Multimodal Model (Qwen2-VL)
+# -------------------------
+MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+model_m = Qwen2VLForConditionalGeneration.from_pretrained(
+    MODEL_ID,
+    trust_remote_code=True,
+    torch_dtype=torch.float16
+).to("cuda").eval()
+
+# -------------------------
+# Load Aya-Vision Model (New Feature)
+# -------------------------
+AYA_MODEL_ID = "CohereForAI/aya-vision-8b"
+aya_processor = AutoProcessor.from_pretrained(AYA_MODEL_ID)
+aya_model = AutoModelForImageTextToText.from_pretrained(
+    AYA_MODEL_ID, device_map="auto", torch_dtype=torch.float16
+)
+aya_tokenizer = AutoTokenizer.from_pretrained(AYA_MODEL_ID)
+
+# -------------------------
+# Stable Diffusion XL Settings & Pipeline
+# -------------------------
 MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
 ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
 BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))  # For batched image generation
 
-# Load the SDXL pipeline
 sd_pipe = StableDiffusionXLPipeline.from_pretrained(
     MODEL_ID_SD,
     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
@@ -104,15 +123,12 @@ sd_pipe = StableDiffusionXLPipeline.from_pretrained(
 ).to(device)
 sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
 
-# Ensure that the text encoder is in half-precision if using CUDA.
 if torch.cuda.is_available():
     sd_pipe.text_encoder = sd_pipe.text_encoder.half()
 
-# Optional: compile the model for speedup if enabled
 if USE_TORCH_COMPILE:
     sd_pipe.compile()
 
-# Optional: offload parts of the model to CPU if needed
 if ENABLE_CPU_OFFLOAD:
     sd_pipe.enable_model_cpu_offload()
 
@@ -168,7 +184,6 @@ def generate_image_fn(
             batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
             if "negative_prompt" in batch_options and batch_options["negative_prompt"] is not None:
                 batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
-            # Wrap the pipeline call in autocast if using CUDA
             if device.type == "cuda":
                 with torch.autocast("cuda", dtype=torch.float16):
                     outputs = sd_pipe(**batch_options)
@@ -193,12 +208,53 @@ def generate(
     Special commands:
     - "@tts1" or "@tts2": triggers text-to-speech.
     - "@image": triggers image generation using the SDXL pipeline.
+    - "@aya-vision": triggers image-text-to-text generation using the Aya-Vision model.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
 
+    # -------------------------
+    # Aya-Vision Feature
+    # -------------------------
+    if text.strip().lower().startswith("@aya-vision"):
+        prompt = text[len("@aya-vision"):].strip()
+        if files:
+            if len(files) > 1:
+                images = [load_image(file) for file in files]
+            elif len(files) == 1:
+                images = [load_image(files[0])]
+            messages = [{
+                "role": "user",
+                "content": [
+                    *[{"type": "image", "image": image} for image in images],
+                    {"type": "text", "text": prompt},
+                ]
+            }]
+        else:
+            messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
+        yield "Processing with Aya-Vision..."
+        inputs = aya_processor.apply_chat_template(
+            messages,
+            padding=True,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt"
+        ).to(aya_model.device)
+        gen_tokens = aya_model.generate(
+            **inputs,
+            max_new_tokens=300,
+            do_sample=True,
+            temperature=0.3,
+        )
+        gen_text = aya_tokenizer.decode(gen_tokens[0], skip_special_tokens=True)
+        yield gen_text
+        return  # Exit early after processing with Aya-Vision
+
+    # -------------------------
+    # Image Generation Feature (@image)
+    # -------------------------
     if text.strip().lower().startswith("@image"):
-        # Remove the "@image" tag and use the rest as prompt
         prompt = text[len("@image"):].strip()
         yield "Generating image..."
         image_paths, used_seed = generate_image_fn(
@@ -214,10 +270,12 @@ def generate(
             use_resolution_binning=True,
             num_images=1,
         )
-        # Yield the generated image so that the chat interface displays it.
         yield gr.Image(image_paths[0])
         return  # Exit early
 
+    # -------------------------
+    # TTS Feature (@tts1 or @tts2)
+    # -------------------------
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
@@ -234,6 +292,9 @@ def generate(
     conversation = clean_chat_history(chat_history)
     conversation.append({"role": "user", "content": text})
 
+    # -------------------------
+    # Multimodal Input (with files) using Qwen2-VL
+    # -------------------------
     if files:
         if len(files) > 1:
             images = [load_image(image) for image in files]
@@ -263,7 +324,9 @@ def generate(
             time.sleep(0.01)
             yield buffer
     else:
-
+        # -------------------------
+        # Text-only Generation
+        # -------------------------
        input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
        if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
@@ -291,8 +354,7 @@ def generate(
 
     final_response = "".join(outputs)
     yield final_response
-
-    # If TTS was requested, convert the final response to speech.
+
     if is_tts and voice:
         output_file = asyncio.run(text_to_speech(final_response, voice))
         yield gr.Audio(output_file, autoplay=True)
@@ -307,13 +369,13 @@ demo = gr.ChatInterface(
         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
     ],
     examples=[
+
+        [{"text": "@aya-vision Extract JSON from the image", "files": ["examples/document.jpg"]}],
+        [{"text": "@aya-vision Summarize the letter", "files": ["examples/1.png"]}],
         ["@tts1 Who is Nikola Tesla, and why did he die?"],
-        [{"text": "Extract JSON from the image", "files": ["examples/document.jpg"]}],
-        [{"text": "summarize the letter", "files": ["examples/1.png"]}],
         ["@image Chocolate dripping from a donut against a yellow background, in the style of brocore, hyper-realistic"],
         ["Write a Python function to check if a number is prime."],
         ["@tts2 What causes rainbows to form?"],
-
     ],
     cache_examples=False,
     type="messages",