prithivMLmods committed on
Commit
0be8fb1
·
verified ·
1 Parent(s): eceb410

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -8
app.py CHANGED
@@ -88,7 +88,7 @@ MAX_MAX_NEW_TOKENS = 2048
88
  DEFAULT_MAX_NEW_TOKENS = 1024
89
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
90
 
91
- # (Image generation related code has been fully removed.)
92
 
93
  MAX_SEED = np.iinfo(np.int32).max
94
 
@@ -200,13 +200,14 @@ def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new
200
  if not text.strip():
201
  return None
202
  try:
203
- # Removed in-function progress calls to maintain UI consistency.
 
204
  input_ids, attention_mask = process_prompt(text, voice, orpheus_tts_tokenizer, tts_device)
205
  with torch.no_grad():
206
  generated_ids = orpheus_tts_model.generate(
207
  input_ids=input_ids,
208
  attention_mask=attention_mask,
209
- max_new_tokens=max_new_tokens,
210
  do_sample=True,
211
  temperature=temperature,
212
  top_p=top_p,
@@ -233,7 +234,7 @@ def generate(
233
  repetition_penalty: float = 1.2,
234
  ):
235
  """
236
- Generates chatbot responses with support for multimodal input, video processing,
237
  TTS, and LLM-augmented TTS.
238
 
239
  Trigger commands:
@@ -299,7 +300,8 @@ def generate(
299
  if lower_text.startswith(tag):
300
  text = text[len(tag):].strip()
301
  yield progress_bar_html("Processing with Orpheus")
302
- audio_output = generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new_tokens)
 
303
  yield gr.Audio(audio_output, autoplay=True)
304
  return
305
 
@@ -331,16 +333,15 @@ def generate(
331
  outputs.append(new_text)
332
  final_response = "".join(outputs)
333
  yield progress_bar_html("Processing with Orpheus")
334
- audio_output = generate_speech(final_response, voice, temperature, top_p, repetition_penalty, max_new_tokens)
335
  yield gr.Audio(audio_output, autoplay=True)
336
  return
337
 
338
  # Default branch for regular chat (text and multimodal without TTS).
339
  conversation = clean_chat_history(chat_history)
340
  conversation.append({"role": "user", "content": text})
341
- # If files are provided, only non-image files (e.g. video) are processed via Qwen2VL.
342
  if files:
343
- # Process files using the processor (this branch no longer handles image generation)
344
  if len(files) > 1:
345
  inputs_list = [load_image(image) for image in files]
346
  elif len(files) == 1:
 
88
  DEFAULT_MAX_NEW_TOKENS = 1024
89
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
90
 
91
+ # (Image generation related code has been removed.)
92
 
93
  MAX_SEED = np.iinfo(np.int32).max
94
 
 
200
  if not text.strip():
201
  return None
202
  try:
203
+ # For TTS we ensure at least 2048 tokens are generated
204
+ tts_tokens = max(max_new_tokens, 2048)
205
  input_ids, attention_mask = process_prompt(text, voice, orpheus_tts_tokenizer, tts_device)
206
  with torch.no_grad():
207
  generated_ids = orpheus_tts_model.generate(
208
  input_ids=input_ids,
209
  attention_mask=attention_mask,
210
+ max_new_tokens=tts_tokens,
211
  do_sample=True,
212
  temperature=temperature,
213
  top_p=top_p,
 
234
  repetition_penalty: float = 1.2,
235
  ):
236
  """
237
+ Generates chatbot responses with support for video processing,
238
  TTS, and LLM-augmented TTS.
239
 
240
  Trigger commands:
 
300
  if lower_text.startswith(tag):
301
  text = text[len(tag):].strip()
302
  yield progress_bar_html("Processing with Orpheus")
303
+ # Use at least 2048 tokens for TTS to cover full text
304
+ audio_output = generate_speech(text, voice, temperature, top_p, repetition_penalty, max(max_new_tokens, 2048))
305
  yield gr.Audio(audio_output, autoplay=True)
306
  return
307
 
 
333
  outputs.append(new_text)
334
  final_response = "".join(outputs)
335
  yield progress_bar_html("Processing with Orpheus")
336
+ audio_output = generate_speech(final_response, voice, temperature, top_p, repetition_penalty, max(max_new_tokens, 2048))
337
  yield gr.Audio(audio_output, autoplay=True)
338
  return
339
 
340
  # Default branch for regular chat (text and multimodal without TTS).
341
  conversation = clean_chat_history(chat_history)
342
  conversation.append({"role": "user", "content": text})
343
+ # If files are provided, process them using the processor (assumed to be video if not image)
344
  if files:
 
345
  if len(files) > 1:
346
  inputs_list = [load_image(image) for image in files]
347
  elif len(files) == 1: