whisper-asr-uz

Running

mrmuminov committed on May 16

Commit

c53972e

verified ·

1 Parent(s): c4227b5

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,48 +1,49 @@
-import torch
 import gradio as gr
 from transformers import pipeline
 from transformers.pipelines.audio_utils import ffmpeg_read
-import numpy as np
 MODEL_NAME = "dataprizma/whisper-large-v3-turbo"
-BATCH_SIZE = 8
-device = 0 if torch.cuda.is_available() else "cpu"
-pipe = pipeline(
-    task="automatic-speech-recognition",
-    model=MODEL_NAME,
-    chunk_length_s=9,
-    device=device,
-    model_kwargs={
-        "attn_implementation": "eager"
-    },
-)
 def transcribe(audio_file):
-    if audio_file is None:
-        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting.")
-    with open(audio_file, "rb") as f:
-        audio_data = f.read()
-    audio_array = ffmpeg_read(audio_data, sampling_rate=pipe.feature_extractor.sampling_rate)
-    duration = len(audio_array) / pipe.feature_extractor.sampling_rate
-    print(f"Audio duration: {duration:.2f} seconds")
-    result = pipe(
-        inputs=audio_array,
-        batch_size=BATCH_SIZE,
-        return_timestamps=False,
-        generate_kwargs={
-            "task": "transcribe",
-            "no_speech_threshold": 0.4,
-            "logprob_threshold": -1.0,
-            "compression_ratio_threshold": 2.4
-        }
-    )
-    return result["text"] if isinstance(result, dict) else result
 demo = gr.Blocks()

+from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import gradio as gr
+import torch
+import torchaudio
 from transformers import pipeline
 from transformers.pipelines.audio_utils import ffmpeg_read
 MODEL_NAME = "dataprizma/whisper-large-v3-turbo"
+processor = WhisperProcessor.from_pretrained(MODEL_NAME)
+model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
 def transcribe(audio_file):
+    global model
+    global processor
+    # Move to GPU if available
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = model.to(device)
+    # Load and preprocess audio
+    waveform, sample_rate = torchaudio.load(audio_file)
+    if sample_rate != 16000:
+        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
+    # Convert to mono if needed
+    if waveform.shape[0] > 1:
+        waveform = waveform.mean(dim=0, keepdim=True)
+    # Process audio
+    input_features = processor(
+        waveform.squeeze().numpy(),
+        sampling_rate=16000,
+        return_tensors="pt",
+        language="uz"
+    ).input_features.to(device)
+    # Generate transcription
+    with torch.no_grad():
+        predicted_ids = model.generate(input_features)
+    # Decode
+    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+    return transcription
 demo = gr.Blocks()