KIFF committed
Commit 16ebd09 · verified · 1 Parent(s): a4ab387

Update app.py

Files changed (1):
  1. app.py +30 -34
app.py CHANGED
@@ -1,50 +1,46 @@
  import torch
  import gradio as gr
- from transformers import pipeline
+ import ffmpeg
  import numpy as np
- from pydub import AudioSegment
- import io
+ import whisper
 
- MODEL_NAME = "openai/whisper-large-v3"
- BATCH_SIZE = 8
+ MODEL_NAME = "large-v3"
+ SAMPLE_RATE = 16000
 
- device = 0 if torch.cuda.is_available() else "cpu"
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model = whisper.load_model(MODEL_NAME).to(device)
 
- pipe = pipeline(
-     task="automatic-speech-recognition",
-     model=MODEL_NAME,
-     chunk_length_s=30,
-     device=device,
- )
+ def load_audio(file):
+     try:
+         # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
+         # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
+         out, _ = (
+             ffmpeg.input(file, threads=0)
+             .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=SAMPLE_RATE)
+             .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
+         )
+     except ffmpeg.Error as e:
+         raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+
+     return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
 
  def transcribe(audio_file, task):
      if audio_file is None:
          raise gr.Error("No audio file submitted! Please upload an audio file before submitting your request.")
 
      try:
-         # Read the audio file
-         audio = AudioSegment.from_file(audio_file)
-
-         # Convert to mono if stereo
-         if audio.channels > 1:
-             audio = audio.set_channels(1)
-
-         # Convert to 16kHz sample rate
-         audio = audio.set_frame_rate(16000)
-
-         # Convert to numpy array
-         samples = np.array(audio.get_array_of_samples()).astype(np.float32) / 32768.0
+         # Load audio
+         audio = load_audio(audio_file.name)
 
-         # Convert to the format expected by Whisper
-         inputs = {"array": samples, "sampling_rate": 16000}
+         # Transcribe
+         result = model.transcribe(audio, task=task, language="en")
 
-         result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
-
+         # Format output
          output = ""
-         for chunk in result["chunks"]:
-             start_time = chunk["timestamp"][0]
-             end_time = chunk["timestamp"][1]
-             text = chunk["text"]
+         for segment in result["segments"]:
+             start_time = segment["start"]
+             end_time = segment["end"]
+             text = segment["text"]
          output += f"[{format_timestamp(start_time)} -> {format_timestamp(end_time)}] {text}\n"
 
      return output
@@ -66,9 +62,9 @@ demo = gr.Interface(
      fn=transcribe,
      inputs=[audio_input, task_input],
      outputs=output,
-     title=f"Whisper Large V3: Transcribe Audio with Timestamps",
+     title=f"Whisper {MODEL_NAME}: Transcribe Audio with Timestamps",
      description=(
-         f"Transcribe audio files with Whisper Large V3 [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}). "
+         f"Transcribe audio files with Whisper {MODEL_NAME}. "
          "Upload an audio file and choose whether to transcribe or translate. "
          "The output includes timestamps for each transcribed segment."
      ),
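
Both the old and new code call a `format_timestamp` helper that is defined elsewhere in app.py and is not touched by this commit. Below is a minimal sketch of what such a helper could look like (hypothetical, not the repository's actual implementation), followed by a standalone exercise of the new `load_audio` + `model.transcribe` path; the sample file path is an assumption.

```python
# Hypothetical sketch only: format_timestamp lives elsewhere in app.py and its
# real signature/output format may differ.
def format_timestamp(seconds: float) -> str:
    # Render seconds as HH:MM:SS.mmm
    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    millis = int(round((seconds - int(seconds)) * 1000))
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"

# Standalone check of the updated pipeline (assumes a local "sample.wav" plus the
# openai-whisper and ffmpeg-python dependencies used by the new app.py).
audio = load_audio("sample.wav")
result = model.transcribe(audio, task="transcribe", language="en")
for segment in result["segments"]:
    print(f"[{format_timestamp(segment['start'])} -> {format_timestamp(segment['end'])}] {segment['text']}")
```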