Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -12,6 +12,7 @@ import torch
 import numpy as np
 from PIL import Image
 import cv2
+import edge_tts
 
 from transformers import (
     AutoModelForCausalLM,
@@ -29,7 +30,7 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-# Load text-only model and tokenizer
+# Load text-only model and tokenizer (Pocket Llama)
 model_id = "prithivMLmods/Pocket-Llama2-3.2-3B-Instruct"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
@@ -39,7 +40,8 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 model.eval()
 
-
+# Load multimodal processor and model (Callisto OCR3)
+MODEL_ID = "prithivMLmods/Callisto-OCR3-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
@@ -47,6 +49,19 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 
+# Edge TTS voices mapping for new tags.
+TTS_VOICE_MAP = {
+    "@jennyneural": "en-US-JennyNeural",
+    "@guyneural": "en-US-GuyNeural",
+}
+
+async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
+    """
+    Convert text to speech using Edge TTS and save as MP3.
+    """
+    communicate = edge_tts.Communicate(text, voice)
+    await communicate.save(output_file)
+    return output_file
 
 def clean_chat_history(chat_history):
     """
@@ -59,7 +74,6 @@ def clean_chat_history(chat_history):
         cleaned.append(msg)
     return cleaned
 
-
 def downsample_video(video_path):
     """
     Downsamples the video to 10 evenly spaced frames.
@@ -82,11 +96,10 @@ def downsample_video(video_path):
     vidcap.release()
     return frames
 
-
 def progress_bar_html(label: str) -> str:
     """
     Returns an HTML snippet for a thin progress bar with a label.
-    The progress bar is styled as a
+    The progress bar is styled as a light cyan animated bar.
     """
     return f'''
     <div style="display: flex; align-items: center;">
@@ -103,7 +116,6 @@ def progress_bar_html(label: str) -> str:
     </style>
     '''
 
-
 @spaces.GPU
 def generate(input_dict: dict, chat_history: list[dict],
              max_new_tokens: int = 1024,
@@ -112,17 +124,26 @@ def generate(input_dict: dict, chat_history: list[dict],
              top_k: int = 50,
              repetition_penalty: float = 1.2):
     """
-    Generates chatbot responses with support for multimodal input
+    Generates chatbot responses with support for multimodal input, video processing,
+    and Edge TTS when using the new tags @JennyNeural or @GuyNeural.
     Special command:
-      - "@video-infer": triggers video processing using
+      - "@video-infer": triggers video processing using Callisto OCR3.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
     lower_text = text.strip().lower()
 
-    #
+    # Check for TTS tag in the prompt.
+    tts_voice = None
+    for tag, voice in TTS_VOICE_MAP.items():
+        if lower_text.startswith(tag):
+            tts_voice = voice
+            text = text[len(tag):].strip()  # Remove the tag from the prompt.
+            break
+
+    # Branch for video processing with Callisto OCR3.
     if lower_text.startswith("@video-infer"):
-        prompt = text[len("@video-infer"):].strip()
+        prompt = text[len("@video-infer"):].strip() if not tts_voice else text
         if files:
             # Assume the first file is a video.
             video_path = files[0]
@@ -143,7 +164,7 @@ def generate(input_dict: dict, chat_history: list[dict],
             {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
             {"role": "user", "content": [{"type": "text", "text": prompt}]}
         ]
-        #
+        # Enable truncation to avoid token/feature mismatch.
         inputs = processor.apply_chat_template(
             messages,
             tokenize=True,
@@ -175,7 +196,7 @@ def generate(input_dict: dict, chat_history: list[dict],
             yield buffer
         return
 
-    #
+    # Multimodal processing when files are provided.
     if files:
         if len(files) > 1:
             images = [load_image(image) for image in files]
@@ -212,6 +233,7 @@ def generate(input_dict: dict, chat_history: list[dict],
             time.sleep(0.01)
             yield buffer
     else:
+        # Normal text conversation processing with Pocket Llama.
         conversation = clean_chat_history(chat_history)
         conversation.append({"role": "user", "content": text})
         input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
@@ -241,6 +263,11 @@ def generate(input_dict: dict, chat_history: list[dict],
         final_response = "".join(outputs)
         yield final_response
 
+        # If a TTS voice was specified, convert the final response to speech.
+        if tts_voice:
+            output_file = asyncio.run(text_to_speech(final_response, tts_voice))
+            yield gr.Audio(output_file, autoplay=True)
+
 # Create the Gradio ChatInterface with the custom CSS applied
 demo = gr.ChatInterface(
     fn=generate,
@@ -252,10 +279,12 @@ demo = gr.ChatInterface(
         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
     ],
     examples=[
-        ["Write the code that converts temperatures between
+        ["Write the code that converts temperatures between Celsius and Fahrenheit in short"],
        [{"text": "Create a short story based on the image.", "files": ["examples/1.jpg"]}],
        [{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}],
        [{"text": "@video-infer Describe the Ad", "files": ["examples/coca.mp4"]}],
+        ["@JennyNeural Who was Nikola Tesla and what were his contributions?"],
+        ["@GuyNeural Explain how rainbows are formed."]
    ],
    cache_examples=False,
    description="# **Pocket Llama**",
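For reference, the Edge TTS path introduced by this commit can be exercised on its own. The sketch below mirrors the TTS_VOICE_MAP lookup and the text_to_speech helper from the diff; it assumes the edge-tts package is installed and uses a made-up prompt, so treat it as an illustrative standalone snippet rather than part of the Space itself.

import asyncio
import edge_tts

# Same voice mapping as added in app.py.
TTS_VOICE_MAP = {
    "@jennyneural": "en-US-JennyNeural",
    "@guyneural": "en-US-GuyNeural",
}

async def text_to_speech(text: str, voice: str, output_file: str = "output.mp3") -> str:
    # Synthesize speech with Edge TTS and save it as an MP3 file.
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(output_file)
    return output_file

# Hypothetical prompt carrying a TTS tag, as in the new examples.
prompt = "@JennyNeural Who was Nikola Tesla and what were his contributions?"
lower_text = prompt.strip().lower()
for tag, voice in TTS_VOICE_MAP.items():
    if lower_text.startswith(tag):
        spoken_text = prompt[len(tag):].strip()  # Strip the tag, keep the question.
        audio_path = asyncio.run(text_to_speech(spoken_text, voice))
        print(f"Saved synthesized speech to {audio_path}")
        break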