Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -45,7 +45,6 @@ hermes_llm_model.eval()
 
 # Load Qwen2-VL processor and model for multimodal tasks
 MODEL_ID_QWEN = "prithivMLmods/Qwen2-VL-OCR2-2B-Instruct"
-# (If needed, you can pass extra arguments such as a size dict here if required.)
 processor = AutoProcessor.from_pretrained(MODEL_ID_QWEN, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     MODEL_ID_QWEN,
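As context for this hunk: once loaded, the processor/model pair is typically driven as below. This is a minimal sketch assuming a single image plus a text instruction; the chat-template call and generation arguments are illustrative, not lines from app.py, and example.jpg is a placeholder.

import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

MODEL_ID_QWEN = "prithivMLmods/Qwen2-VL-OCR2-2B-Instruct"
processor = AutoProcessor.from_pretrained(MODEL_ID_QWEN, trust_remote_code=True)
model_m = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID_QWEN, torch_dtype=torch.float16, device_map="auto"
).eval()

# Build a chat-formatted prompt with one image slot and one instruction.
messages = [{"role": "user", "content": [
    {"type": "image"},
    {"type": "text", "text": "Describe this image."},
]}]
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[prompt], images=[Image.open("example.jpg")], return_tensors="pt").to(model_m.device)

with torch.no_grad():
    output_ids = model_m.generate(**inputs, max_new_tokens=128)
print(processor.batch_decode(output_ids, skip_special_tokens=True)[0])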
@@ -91,11 +90,11 @@ DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 # Stable Diffusion XL setup
-MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")
+MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # e.g. SG161222/RealVisXL_V5.0_Lightning
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
 ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))
+BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))
 
 sd_pipe = StableDiffusionXLPipeline.from_pretrained(
     MODEL_ID_SD,
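The hunk shows only the head of the from_pretrained call; below is a minimal sketch of how these environment toggles usually plug into the pipeline. The dtype, offload, and compile choices are assumptions, not lines from app.py.

import os
import torch
from diffusers import StableDiffusionXLPipeline

MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # e.g. SG161222/RealVisXL_V5.0_Lightning
USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"

sd_pipe = StableDiffusionXLPipeline.from_pretrained(
    MODEL_ID_SD,
    torch_dtype=torch.float16,  # assumption: half precision for GPU inference
    use_safetensors=True,
)
if ENABLE_CPU_OFFLOAD:
    sd_pipe.enable_model_cpu_offload()  # keeps VRAM low at some speed cost
else:
    sd_pipe.to("cuda")
if USE_TORCH_COMPILE:
    # One-time compile of the UNet; pays off over repeated generations.
    sd_pipe.unet = torch.compile(sd_pipe.unet, mode="reduce-overhead", fullgraph=True)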
@@ -262,14 +261,12 @@ def redistribute_codes(code_list, snac_model):
     audio_hat = snac_model.decode(codes)
     return audio_hat.detach().squeeze().cpu().numpy()
 
-
-def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new_tokens, progress=gr.Progress()):
+def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new_tokens):
     if not text.strip():
         return None
     try:
-        progress
+        # Removed in-function progress calls to maintain UI consistency.
         input_ids, attention_mask = process_prompt(text, voice, orpheus_tts_tokenizer, tts_device)
-        progress(0.3, "Generating speech tokens...")
         with torch.no_grad():
             generated_ids = orpheus_tts_model.generate(
                 input_ids=input_ids,
@@ -282,9 +279,7 @@ def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new
                 num_return_sequences=1,
                 eos_token_id=128258,
             )
-        progress(0.6, "Processing speech tokens...")
         code_list = parse_output(generated_ids)
-        progress(0.8, "Converting to audio...")
         audio_samples = redistribute_codes(code_list, snac_model)
         return (24000, audio_samples)
     except Exception as e:
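Taken together, the two hunks above leave generate_speech as a plain helper with no Gradio dependency; progress reporting moves to the caller. A condensed sketch of the resulting flow, with the attention-mask, sampling flag, and exception body filled in as assumptions:

def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new_tokens):
    """Text -> (sample_rate, waveform) tuple for gr.Audio, or None."""
    if not text.strip():
        return None
    try:
        input_ids, attention_mask = process_prompt(text, voice, orpheus_tts_tokenizer, tts_device)
        with torch.no_grad():
            generated_ids = orpheus_tts_model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,  # assumption: mask is forwarded
                do_sample=True,                 # assumption: sampling enabled
                temperature=temperature,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                max_new_tokens=max_new_tokens,
                num_return_sequences=1,
                eos_token_id=128258,            # Orpheus end-of-speech token
            )
        code_list = parse_output(generated_ids)                    # token ids -> SNAC codes
        audio_samples = redistribute_codes(code_list, snac_model)  # codes -> waveform
        return (24000, audio_samples)  # gr.Audio accepts a (rate, ndarray) tuple
    except Exception:
        return None  # assumption: failures degrade to "no audio"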
@@ -389,7 +384,7 @@ def generate(
     for tag, voice in tts_tags.items():
         if lower_text.startswith(tag):
             text = text[len(tag):].strip()
-
+            yield progress_bar_html("Processing with Orpheus")
             audio_output = generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new_tokens)
             yield gr.Audio(audio_output, autoplay=True)
             return
@@ -421,7 +416,7 @@ def generate(
     for new_text in streamer:
         outputs.append(new_text)
     final_response = "".join(outputs)
-
+    yield progress_bar_html("Processing with Orpheus")
     audio_output = generate_speech(final_response, voice, temperature, top_p, repetition_penalty, max_new_tokens)
     yield gr.Audio(audio_output, autoplay=True)
     return
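progress_bar_html is called in both hunks but not defined in this diff. Below is a hypothetical sketch of such a helper, with the markup and styling invented for illustration. Yielding interim HTML from the streaming generator is what made the gr.Progress parameter removable above: the generator stays the single source of UI updates.

def progress_bar_html(label: str) -> str:
    # Hypothetical: an indeterminate progress bar yielded into the chat
    # stream before a slow model call; replaced by the final yield.
    return (
        '<div style="display:flex;align-items:center;gap:8px;">'
        f'<span>{label}</span>'
        '<div style="width:110px;height:6px;background:#eee;border-radius:3px;overflow:hidden;">'
        '<div style="width:40%;height:100%;background:#7c3aed;'
        'animation:slide 1s linear infinite;"></div></div></div>'
        '<style>@keyframes slide { from {margin-left:-40%;} to {margin-left:100%;} }</style>'
    )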
@@ -494,7 +489,6 @@ demo = gr.ChatInterface(
         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
     ],
     examples=[
-
         ["@josh-tts Hey! I’m Josh, [gasp] and wow, did I just surprise you with my realistic voice?"],
         ["@dan-llm Explain the General Relativity theorem in short"],
         ["@emma-tts Hey, I’m Emma, [sigh] and yes, I can talk just like a person… even when I’m tired."],
@@ -508,7 +502,6 @@ demo = gr.ChatInterface(
         ["@image Chocolate dripping from a donut"],
         [{"text": "@video-infer Summarize the event in video", "files": ["examples/sky.mp4"]}],
         [{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}],
-
     ],
     cache_examples=False,
     type="messages",