mrmuminov committed
Commit c53972e · verified · 1 parent: c4227b5

Update app.py

Files changed (1): app.py (+39 −38)
app.py CHANGED
@@ -1,48 +1,49 @@
-import torch
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import gradio as gr
+import torch
+import torchaudio
 from transformers import pipeline
 from transformers.pipelines.audio_utils import ffmpeg_read
-import numpy as np
 
 MODEL_NAME = "dataprizma/whisper-large-v3-turbo"
-BATCH_SIZE = 8
-
-device = 0 if torch.cuda.is_available() else "cpu"
-
-pipe = pipeline(
-    task="automatic-speech-recognition",
-    model=MODEL_NAME,
-    chunk_length_s=9,
-    device=device,
-    model_kwargs={
-        "attn_implementation": "eager"
-    },
-)
+
+processor = WhisperProcessor.from_pretrained(MODEL_NAME)
+model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
+
 
 def transcribe(audio_file):
-    if audio_file is None:
-        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting.")
-
-    with open(audio_file, "rb") as f:
-        audio_data = f.read()
-
-    audio_array = ffmpeg_read(audio_data, sampling_rate=pipe.feature_extractor.sampling_rate)
-    duration = len(audio_array) / pipe.feature_extractor.sampling_rate
-    print(f"Audio duration: {duration:.2f} seconds")
-
-    result = pipe(
-        inputs=audio_array,
-        batch_size=BATCH_SIZE,
-        return_timestamps=False,
-        generate_kwargs={
-            "task": "transcribe",
-            "no_speech_threshold": 0.4,
-            "logprob_threshold": -1.0,
-            "compression_ratio_threshold": 2.4
-        }
-    )
-
-    return result["text"] if isinstance(result, dict) else result
+
+    global model
+    global processor
+
+    # Move to GPU if available
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = model.to(device)
+
+    # Load and preprocess audio
+    waveform, sample_rate = torchaudio.load(audio_file)
+    if sample_rate != 16000:
+        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
+
+    # Convert to mono if needed
+    if waveform.shape[0] > 1:
+        waveform = waveform.mean(dim=0, keepdim=True)
+
+    # Process audio
+    input_features = processor(
+        waveform.squeeze().numpy(),
+        sampling_rate=16000,
+        return_tensors="pt",
+        language="uz"
+    ).input_features.to(device)
+
+    # Generate transcription
+    with torch.no_grad():
+        predicted_ids = model.generate(input_features)
+
+    # Decode
+    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+    return transcription
 
 demo = gr.Blocks()
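A note on the new code path: the `language="uz"` keyword in the `processor(...)` call only reaches the feature extractor, where it does not influence decoding (depending on the transformers version it is silently ignored or rejected); Whisper's language hint is normally supplied at generation time. Also, unlike the removed `pipeline(..., chunk_length_s=...)` setup, a bare `model.generate` call sees only Whisper's fixed 30-second input window, so longer uploads get truncated. Below is a minimal sketch of the generate-time alternative, reusing `processor`, `model`, `waveform`, and `device` from the updated app.py; the `language`/`task` keywords to `generate` assume a recent transformers release.

    import torch

    # Sketch: move the language/task hints from the processor call to generate().
    # Assumes WhisperForConditionalGeneration.generate accepts `language` and
    # `task` (true in recent transformers; on older versions, pass
    # forced_decoder_ids=processor.get_decoder_prompt_ids(language="uz",
    # task="transcribe") instead).
    input_features = processor(
        waveform.squeeze().numpy(),
        sampling_rate=16000,
        return_tensors="pt",
    ).input_features.to(device)

    with torch.no_grad():
        predicted_ids = model.generate(
            input_features,
            language="uz",      # decode as Uzbek instead of auto-detecting
            task="transcribe",  # transcribe rather than translate to English
        )

    text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

If inputs longer than 30 seconds matter for this Space, keeping a chunked `pipeline(...)` (or slicing the waveform and transcribing the pieces) would avoid the truncation mentioned above.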