KIFF committed on
Commit
674036d
·
verified ·
1 Parent(s): 74520b5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -17
app.py CHANGED
@@ -2,7 +2,8 @@ import torch
2
  import gradio as gr
3
  from transformers import pipeline
4
  import numpy as np
5
- import librosa
 
6
 
7
  MODEL_NAME = "openai/whisper-large-v3"
8
  BATCH_SIZE = 8
@@ -20,26 +21,36 @@ def transcribe(audio_file, task):
20
  if audio_file is None:
21
  raise gr.Error("No audio file submitted! Please upload an audio file before submitting your request.")
22
 
23
- # Load audio file
24
  try:
25
- # Use librosa to load the audio file
26
- audio, sr = librosa.load(audio_file, sr=16000) # Whisper expects 16kHz sampling rate
27
- except Exception as e:
28
- raise gr.Error(f"Error loading audio file: {str(e)}")
 
 
 
 
 
 
 
 
29
 
30
- # Convert to format expected by Whisper
31
- inputs = {"array": audio, "sampling_rate": sr}
32
 
33
- result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
34
-
35
- output = ""
36
- for chunk in result["chunks"]:
37
- start_time = chunk["timestamp"][0]
38
- end_time = chunk["timestamp"][1]
39
- text = chunk["text"]
40
- output += f"[{format_timestamp(start_time)} -> {format_timestamp(end_time)}] {text}\n"
41
 
42
- return output
 
 
 
43
 
44
  def format_timestamp(seconds):
45
  minutes, seconds = divmod(seconds, 60)
 
2
  import gradio as gr
3
  from transformers import pipeline
4
  import numpy as np
5
+ from pydub import AudioSegment
6
+ import io
7
 
8
  MODEL_NAME = "openai/whisper-large-v3"
9
  BATCH_SIZE = 8
 
21
  if audio_file is None:
22
  raise gr.Error("No audio file submitted! Please upload an audio file before submitting your request.")
23
 
 
24
  try:
25
+ # Read the audio file
26
+ audio = AudioSegment.from_file(audio_file)
27
+
28
+ # Convert to mono if stereo
29
+ if audio.channels > 1:
30
+ audio = audio.set_channels(1)
31
+
32
+ # Convert to 16kHz sample rate
33
+ audio = audio.set_frame_rate(16000)
34
+
35
+ # Convert to numpy array
36
+ samples = np.array(audio.get_array_of_samples()).astype(np.float32) / 32768.0
37
 
38
+ # Convert to the format expected by Whisper
39
+ inputs = {"array": samples, "sampling_rate": 16000}
40
 
41
+ result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
42
+
43
+ output = ""
44
+ for chunk in result["chunks"]:
45
+ start_time = chunk["timestamp"][0]
46
+ end_time = chunk["timestamp"][1]
47
+ text = chunk["text"]
48
+ output += f"[{format_timestamp(start_time)} -> {format_timestamp(end_time)}] {text}\n"
49
 
50
+ return output
51
+
52
+ except Exception as e:
53
+ raise gr.Error(f"Error processing audio file: {str(e)}")
54
 
55
  def format_timestamp(seconds):
56
  minutes, seconds = divmod(seconds, 60)