Update app.py
app.py CHANGED
@@ -11,6 +11,7 @@ import spaces
 import torch
 import numpy as np
 from PIL import Image
+import edge_tts
 import cv2
 
 from transformers import (
@@ -30,7 +31,7 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 # Load text-only model and tokenizer
-model_id = "prithivMLmods/
+model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
@@ -39,6 +40,11 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 model.eval()
 
+TTS_VOICES = [
+    "en-US-JennyNeural",  # @tts1
+    "en-US-GuyNeural",    # @tts2
+]
+
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -47,6 +53,12 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 
+async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
+    """Convert text to speech using Edge TTS and save as MP3"""
+    communicate = edge_tts.Communicate(text, voice)
+    await communicate.save(output_file)
+    return output_file
+
 def clean_chat_history(chat_history):
     """
     Filter out any chat entries whose "content" is not a string.
@@ -59,6 +71,7 @@ def clean_chat_history(chat_history):
     return cleaned
 
 # Environment variables and parameters for Stable Diffusion XL
+# Use : SG161222/RealVisXL_V4.0_Lightning or SG161222/RealVisXL_V5.0_Lightning
 MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
@@ -201,8 +214,9 @@ def generate(
     repetition_penalty: float = 1.2,
 ):
     """
-    Generates chatbot responses with support for multimodal input and image generation.
+    Generates chatbot responses with support for multimodal input, TTS, and image generation.
     Special commands:
+    - "@tts1" or "@tts2": triggers text-to-speech.
     - "@image": triggers image generation using the SDXL pipeline.
     - "@qwen2vl-video": triggers video processing using Qwen2VL.
     """
@@ -279,10 +293,20 @@ def generate(
             yield buffer
         return
 
-    #
-
-
-
+    # Determine if TTS is requested.
+    tts_prefix = "@tts"
+    is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
+    voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
+
+    if is_tts and voice_index:
+        voice = TTS_VOICES[voice_index - 1]
+        text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
+        conversation = [{"role": "user", "content": text}]
+    else:
+        voice = None
+        text = text.replace(tts_prefix, "").strip()
+        conversation = clean_chat_history(chat_history)
+        conversation.append({"role": "user", "content": text})
 
     if files:
         if len(files) > 1:
@@ -338,6 +362,9 @@ def generate(
            yield "".join(outputs)
        final_response = "".join(outputs)
        yield final_response
+       if is_tts and voice:
+           output_file = asyncio.run(text_to_speech(final_response, voice))
+           yield gr.Audio(output_file, autoplay=True)
 
 demo = gr.ChatInterface(
     fn=generate,
@@ -354,14 +381,16 @@ demo = gr.ChatInterface(
         [{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}],
         ["@image Chocolate dripping from a donut"],
         ["Python Program for Array Rotation"],
+        ["@tts1 Who is Nikola Tesla, and why did he die?"],
         [{"text": "Extract JSON from the image", "files": ["examples/document.jpg"]}],
         [{"text": "summarize the letter", "files": ["examples/1.png"]}],
+        ["@tts2 What causes rainbows to form?"],
     ],
     cache_examples=False,
     type="messages",
-    description="# **QwQ Edge
+    description="# **QwQ Edge `@video-infer 'prompt..', @image, @tts1`**",
     fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder=" @image for image gen, @video-infer for video, default [text, vision]"),
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder=" @tts1, @tts2-voices, @image for image gen, @video-infer for video, default [text, vision]"),
     stop_btn="Stop Generation",
     multimodal=True,
 )
|