mrmuminov committed on
Commit
270cde7
·
verified ·
1 Parent(s): 31a57d8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -16
app.py CHANGED
@@ -10,16 +10,13 @@ import time
10
  import os
11
  import numpy as np
12
 
13
- # Constants
14
  MODEL_NAME = "dataprizma/whisper-large-v3-turbo"
15
  BATCH_SIZE = 8
16
  FILE_LIMIT_MB = 1000
17
  YT_LENGTH_LIMIT_S = 3600 # 1 hour limit
18
 
19
- # Device selection
20
  device = 0 if torch.cuda.is_available() else "cpu"
21
 
22
- # Load Whisper pipeline
23
  pipe = pipeline(
24
  task="automatic-speech-recognition",
25
  model=MODEL_NAME,
@@ -31,35 +28,29 @@ pipe = pipeline(
31
  },
32
  )
33
 
34
- # Transcription function (Fix applied)
35
- def transcribe(audio_file, task):
36
  if audio_file is None:
37
  raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting.")
38
 
39
- # Open file as binary to ensure correct data type
40
  with open(audio_file, "rb") as f:
41
  audio_data = f.read()
42
 
43
- # Read audio using ffmpeg_read (correcting input format)
44
  audio_array = ffmpeg_read(audio_data, pipe.feature_extractor.sampling_rate)
45
-
46
  duration = len(audio_array) / pipe.feature_extractor.sampling_rate
47
  print(f"Audio duration: {duration:.2f} seconds")
48
 
49
- # Convert to proper format
50
  inputs = {
51
  "array": np.array(audio_array),
52
  "sampling_rate": pipe.feature_extractor.sampling_rate
53
  }
54
 
55
  generate_kwargs = {
56
- "task": task,
57
- "no_speech_threshold": 0.3,
58
  "logprob_threshold": -1.0,
59
  "compression_ratio_threshold": 2.4
60
  }
61
-
62
- # Perform transcription
63
  result = pipe(
64
  inputs,
65
  batch_size=BATCH_SIZE,
@@ -69,19 +60,16 @@ def transcribe(audio_file, task):
69
 
70
  return result["text"]
71
 
72
- # Gradio UI
73
  demo = gr.Blocks()
74
 
75
  file_transcribe = gr.Interface(
76
  fn=transcribe,
77
  inputs=[
78
  gr.Audio(type="filepath", label="Audio file"),
79
- gr.Radio(["transcribe", "translate"], label="Task"),
80
  ],
81
  outputs="text",
82
  title="Whisper Large V3: Transcribe Audio",
83
  description="Whisper Large V3 fine-tuned for Uzbek language by Dataprizma",
84
- flagging_mode="never",
85
  )
86
 
87
  with demo:
 
10
  import os
11
  import numpy as np
12
 
 
13
  MODEL_NAME = "dataprizma/whisper-large-v3-turbo"
14
  BATCH_SIZE = 8
15
  FILE_LIMIT_MB = 1000
16
  YT_LENGTH_LIMIT_S = 3600 # 1 hour limit
17
 
 
18
  device = 0 if torch.cuda.is_available() else "cpu"
19
 
 
20
  pipe = pipeline(
21
  task="automatic-speech-recognition",
22
  model=MODEL_NAME,
 
28
  },
29
  )
30
 
31
+ def transcribe(audio_file):
 
32
  if audio_file is None:
33
  raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting.")
34
 
 
35
  with open(audio_file, "rb") as f:
36
  audio_data = f.read()
37
 
 
38
  audio_array = ffmpeg_read(audio_data, pipe.feature_extractor.sampling_rate)
 
39
  duration = len(audio_array) / pipe.feature_extractor.sampling_rate
40
  print(f"Audio duration: {duration:.2f} seconds")
41
 
 
42
  inputs = {
43
  "array": np.array(audio_array),
44
  "sampling_rate": pipe.feature_extractor.sampling_rate
45
  }
46
 
47
  generate_kwargs = {
48
+ "task": "transcribe",
49
+ "no_speech_threshold": 0.4,
50
  "logprob_threshold": -1.0,
51
  "compression_ratio_threshold": 2.4
52
  }
53
+
 
54
  result = pipe(
55
  inputs,
56
  batch_size=BATCH_SIZE,
 
60
 
61
  return result["text"]
62
 
 
63
  demo = gr.Blocks()
64
 
65
  file_transcribe = gr.Interface(
66
  fn=transcribe,
67
  inputs=[
68
  gr.Audio(type="filepath", label="Audio file"),
 
69
  ],
70
  outputs="text",
71
  title="Whisper Large V3: Transcribe Audio",
72
  description="Whisper Large V3 fine-tuned for Uzbek language by Dataprizma",
 
73
  )
74
 
75
  with demo: