Update app.py
Browse files
app.py
CHANGED
@@ -2,7 +2,8 @@ import torch
|
|
2 |
import gradio as gr
|
3 |
from transformers import pipeline
|
4 |
import numpy as np
|
5 |
-
import
|
|
|
6 |
|
7 |
MODEL_NAME = "openai/whisper-large-v3"
|
8 |
BATCH_SIZE = 8
|
@@ -20,26 +21,36 @@ def transcribe(audio_file, task):
|
|
20 |
if audio_file is None:
|
21 |
raise gr.Error("No audio file submitted! Please upload an audio file before submitting your request.")
|
22 |
|
23 |
-
# Load audio file
|
24 |
try:
|
25 |
-
#
|
26 |
-
audio
|
27 |
-
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
-
|
31 |
-
|
32 |
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
|
42 |
-
|
|
|
|
|
|
|
43 |
|
44 |
def format_timestamp(seconds):
|
45 |
minutes, seconds = divmod(seconds, 60)
|
|
|
2 |
import gradio as gr
|
3 |
from transformers import pipeline
|
4 |
import numpy as np
|
5 |
+
from pydub import AudioSegment
|
6 |
+
import io
|
7 |
|
8 |
MODEL_NAME = "openai/whisper-large-v3"
|
9 |
BATCH_SIZE = 8
|
|
|
21 |
if audio_file is None:
|
22 |
raise gr.Error("No audio file submitted! Please upload an audio file before submitting your request.")
|
23 |
|
|
|
24 |
try:
|
25 |
+
# Read the audio file
|
26 |
+
audio = AudioSegment.from_file(audio_file)
|
27 |
+
|
28 |
+
# Convert to mono if stereo
|
29 |
+
if audio.channels > 1:
|
30 |
+
audio = audio.set_channels(1)
|
31 |
+
|
32 |
+
# Convert to 16kHz sample rate
|
33 |
+
audio = audio.set_frame_rate(16000)
|
34 |
+
|
35 |
+
# Convert to numpy array
|
36 |
+
samples = np.array(audio.get_array_of_samples()).astype(np.float32) / 32768.0
|
37 |
|
38 |
+
# Convert to the format expected by Whisper
|
39 |
+
inputs = {"array": samples, "sampling_rate": 16000}
|
40 |
|
41 |
+
result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
|
42 |
+
|
43 |
+
output = ""
|
44 |
+
for chunk in result["chunks"]:
|
45 |
+
start_time = chunk["timestamp"][0]
|
46 |
+
end_time = chunk["timestamp"][1]
|
47 |
+
text = chunk["text"]
|
48 |
+
output += f"[{format_timestamp(start_time)} -> {format_timestamp(end_time)}] {text}\n"
|
49 |
|
50 |
+
return output
|
51 |
+
|
52 |
+
except Exception as e:
|
53 |
+
raise gr.Error(f"Error processing audio file: {str(e)}")
|
54 |
|
55 |
def format_timestamp(seconds):
|
56 |
minutes, seconds = divmod(seconds, 60)
|