Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -88,7 +88,7 @@ MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
-# (Image generation related code has been removed.)
+# (Image generation related code has been fully removed.)
 
 MAX_SEED = np.iinfo(np.int32).max
 
@@ -200,14 +200,13 @@ def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new
     if not text.strip():
         return None
     try:
-        #
-        tts_tokens = max(max_new_tokens, 2048)
+        # Removed in-function progress calls to maintain UI consistency.
         input_ids, attention_mask = process_prompt(text, voice, orpheus_tts_tokenizer, tts_device)
         with torch.no_grad():
             generated_ids = orpheus_tts_model.generate(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
-                max_new_tokens=tts_tokens,
+                max_new_tokens=max_new_tokens,
                 do_sample=True,
                 temperature=temperature,
                 top_p=top_p,
@@ -234,7 +233,7 @@ def generate(
     repetition_penalty: float = 1.2,
 ):
     """
-    Generates chatbot responses with support for video processing,
+    Generates chatbot responses with support for multimodal input, video processing,
     TTS, and LLM-augmented TTS.
 
     Trigger commands:
@@ -300,8 +299,7 @@ def generate(
         if lower_text.startswith(tag):
             text = text[len(tag):].strip()
             yield progress_bar_html("Processing with Orpheus")
-
-            audio_output = generate_speech(text, voice, temperature, top_p, repetition_penalty, max(max_new_tokens, 2048))
+            audio_output = generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new_tokens)
             yield gr.Audio(audio_output, autoplay=True)
             return
 
@@ -333,15 +331,16 @@ def generate(
             outputs.append(new_text)
         final_response = "".join(outputs)
         yield progress_bar_html("Processing with Orpheus")
-        audio_output = generate_speech(final_response, voice, temperature, top_p, repetition_penalty, max(max_new_tokens, 2048))
+        audio_output = generate_speech(final_response, voice, temperature, top_p, repetition_penalty, max_new_tokens)
         yield gr.Audio(audio_output, autoplay=True)
         return
 
     # Default branch for regular chat (text and multimodal without TTS).
     conversation = clean_chat_history(chat_history)
     conversation.append({"role": "user", "content": text})
-    # If files are provided,
+    # If files are provided, only non-image files (e.g. video) are processed via Qwen2VL.
     if files:
+        # Process files using the processor (this branch no longer handles image generation)
         if len(files) > 1:
             inputs_list = [load_image(image) for image in files]
         elif len(files) == 1:
@@ -412,6 +411,7 @@ demo = gr.ChatInterface(
         ["@josh-llm What causes rainbows to form?"],
         ["@dan-tts Yo, I’m Dan, [groan] and yes, I can even sound annoyed if I have to."],
         ["Write python program for array rotation"],
+        [{"text": "summarize the letter", "files": ["examples/1.png"]}],
         ["@tara-tts Hey there, my name is Tara, [laugh] and I’m a speech generation model that can sound just like you!"],
         ["@tara-llm Who is Nikola Tesla, and why did he die?"],
         ["@emma-llm Explain the causes of rainbows"],
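
Note on the token-budget change in generate_speech and in the two generate_speech call sites: the old code forced a floor of 2048 new tokens via tts_tokens = max(max_new_tokens, 2048), while the new code forwards the caller's max_new_tokens to orpheus_tts_model.generate() unchanged. A tiny runnable sketch of the behavioral difference, grounded only in the lines shown in this diff:

# Before this commit the TTS path clamped the token budget to at least 2048;
# after it, the UI slider value is passed through as-is.
def old_tts_budget(max_new_tokens: int) -> int:
    return max(max_new_tokens, 2048)  # removed: tts_tokens = max(max_new_tokens, 2048)

def new_tts_budget(max_new_tokens: int) -> int:
    return max_new_tokens  # now forwarded directly to orpheus_tts_model.generate()

for requested in (256, 1024, 4096):
    print(requested, old_tts_budget(requested), new_tts_budget(requested))
# 256  -> old 2048, new 256
# 1024 -> old 2048, new 1024
# 4096 -> old 4096, new 4096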
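
Note on the TTS trigger branch touched in the -300,8 hunk: generate() routes a message by matching an "@voice-tts" or "@voice-llm" prefix, stripping the tag, and sending the remainder to generate_speech (directly, or after an LLM pass). A minimal sketch of that dispatch; the tag maps below are an illustrative subset, the real mapping lives in app.py:

TTS_TAGS = {"@tara-tts": "tara", "@dan-tts": "dan"}          # illustrative subset
LLM_TTS_TAGS = {"@tara-llm": "tara", "@josh-llm": "josh", "@emma-llm": "emma"}

def route(message: str):
    # Mirrors the diff's pattern: lower-case prefix match, then strip the tag.
    lower_text = message.lower().strip()
    for tag, voice in TTS_TAGS.items():
        if lower_text.startswith(tag):
            return ("tts", voice, message[len(tag):].strip())
    for tag, voice in LLM_TTS_TAGS.items():
        if lower_text.startswith(tag):
            return ("llm+tts", voice, message[len(tag):].strip())
    return ("chat", None, message)

print(route("@tara-tts Hey there, my name is Tara!"))
# ('tts', 'tara', 'Hey there, my name is Tara!')
print(route("Write python program for array rotation"))
# ('chat', None, 'Write python program for array rotation')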
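
Note on the new example row in the -412,6 hunk: the dict form {"text": ..., "files": [...]} is the shape a multimodal gr.ChatInterface accepts for examples with attachments. A minimal, self-contained sketch of that wiring, assuming a recent Gradio version with multimodal=True; the echo handler and the launch block are illustrative assumptions, and the "examples/1.png" path comes from the diff and must exist for the example thumbnail to render:

import gradio as gr

def echo(message, history):
    # With multimodal=True the message arrives as {"text": ..., "files": [...]}.
    files = message.get("files", [])
    return f"Received {len(files)} file(s); text was: {message['text']}"

demo = gr.ChatInterface(
    fn=echo,
    multimodal=True,
    examples=[
        ["Write python program for array rotation"],
        [{"text": "summarize the letter", "files": ["examples/1.png"]}],
    ],
)

if __name__ == "__main__":
    demo.launch()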