Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,57 +1,44 @@
|
|
1 |
import gradio as gr
|
2 |
-
import spaces
|
3 |
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
4 |
import torch
|
5 |
import os
|
6 |
import soundfile as sf
|
7 |
from scipy.signal import resample
|
8 |
|
9 |
-
print(f"Is CUDA available: {torch.cuda.is_available()}")
|
10 |
-
if torch.cuda.is_available():
|
11 |
-
print(f"Using CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
|
12 |
-
else:
|
13 |
-
print("No GPU detected, defaulting to CPU.")
|
14 |
-
|
15 |
# Define the model ID
|
16 |
MODEL_ID = "WMRNORDIC/whisper-swedish-telephonic"
|
17 |
|
18 |
-
# Load token from environment
|
19 |
HF_API_TOKEN = os.getenv("HF_API_TOKEN")
|
20 |
if not HF_API_TOKEN:
|
21 |
-
raise ValueError("HF_API_TOKEN
|
22 |
-
|
23 |
-
# GPU Initialization
|
24 |
-
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
25 |
-
print(f"Running on device: {DEVICE}")
|
26 |
|
27 |
-
#
|
28 |
-
@spaces.GPU
|
29 |
def initialize_model():
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
except Exception as e:
|
38 |
-
print(f"Error loading model or processor: {e}")
|
39 |
-
raise e
|
40 |
-
|
41 |
-
processor, model = initialize_model()
|
42 |
|
43 |
# Function to resample audio to 16kHz
|
44 |
def resample_audio(audio_data, original_rate, target_rate=16000):
    """Resample ``audio_data`` from ``original_rate`` to ``target_rate``.

    Args:
        audio_data: Array of audio samples. Its length (first axis) is used
            as the sample count, and ``scipy.signal.resample`` operates
            along that same axis by default.
        original_rate: Sample rate of ``audio_data`` in Hz.
        target_rate: Desired sample rate in Hz. Defaults to 16000.

    Returns:
        ``audio_data`` itself when the rates already match or when it holds
        no samples; otherwise the resampled array.
    """
    # Nothing to do for matching rates. An empty recording is also returned
    # as-is: a zero-length FFT inside scipy would raise instead.
    if original_rate == target_rate or len(audio_data) == 0:
        return audio_data

    print(f"Resampling audio from {original_rate}Hz to {target_rate}Hz...")
    # Truncate toward zero as before, but never request fewer than one
    # output sample: very short clips would otherwise ask scipy for 0.
    num_samples = max(1, int(len(audio_data) * target_rate / original_rate))
    return resample(audio_data, num_samples)
|
50 |
|
51 |
-
# Transcription function
|
52 |
-
@spaces.GPU
|
53 |
def transcribe_audio(audio):
|
54 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
if isinstance(audio, tuple): # Microphone input
|
56 |
audio_data = audio[1]
|
57 |
sample_rate = audio[0]
|
@@ -61,8 +48,8 @@ def transcribe_audio(audio):
|
|
61 |
audio_data = resample_audio(audio_data, sample_rate)
|
62 |
|
63 |
# Preprocess and perform inference
|
64 |
-
|
65 |
-
input_features = input_features.to(
|
66 |
with torch.no_grad():
|
67 |
predicted_ids = model.generate(input_features)
|
68 |
|
|
|
1 |
import gradio as gr
|
|
|
2 |
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
3 |
import torch
|
4 |
import os
|
5 |
import soundfile as sf
|
6 |
from scipy.signal import resample
|
7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
# Hub identifier of the Whisper checkpoint loaded by this app
MODEL_ID = "WMRNORDIC/whisper-swedish-telephonic"

# Read the Hugging Face access token from the environment and fail fast
# at import time when it is missing or empty
HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
if HF_API_TOKEN is None or HF_API_TOKEN == "":
    raise ValueError("HF_API_TOKEN not found in environment variables. Please set it in the Space settings.")
|
|
|
|
|
|
|
|
|
15 |
|
16 |
+
# Function to initialize the model and processor lazily
|
|
|
17 |
def initialize_model():
    """Load the Whisper processor and model for ``MODEL_ID``.

    Both objects are fetched with ``from_pretrained``, authenticating with
    ``HF_API_TOKEN``. The model is then moved to CUDA when
    ``torch.cuda.is_available()`` reports a GPU, and to the CPU otherwise.

    Returns:
        A ``(processor, model)`` tuple.
    """
    print("Loading model and processor...")
    whisper_processor = WhisperProcessor.from_pretrained(MODEL_ID, token=HF_API_TOKEN)
    whisper_model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID, token=HF_API_TOKEN)

    # Pick the device only after loading, then place the model on it
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"
    whisper_model = whisper_model.to(target_device)

    print("Model loaded successfully.")
    return whisper_processor, whisper_model
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
# Function to resample audio to 16kHz
|
27 |
def resample_audio(audio_data, original_rate, target_rate=16000):
    """Resample ``audio_data`` from ``original_rate`` to ``target_rate``.

    Args:
        audio_data: Array of audio samples. Its length (first axis) is used
            as the sample count, and ``scipy.signal.resample`` operates
            along that same axis by default.
        original_rate: Sample rate of ``audio_data`` in Hz.
        target_rate: Desired sample rate in Hz. Defaults to 16000.

    Returns:
        ``audio_data`` itself when the rates already match or when it holds
        no samples; otherwise the resampled array.
    """
    # Nothing to do for matching rates. An empty recording is also returned
    # as-is: a zero-length FFT inside scipy would raise instead.
    if original_rate == target_rate or len(audio_data) == 0:
        return audio_data

    # Truncate toward zero as before, but never request fewer than one
    # output sample: very short clips would otherwise ask scipy for 0.
    num_samples = max(1, int(len(audio_data) * target_rate / original_rate))
    return resample(audio_data, num_samples)
|
32 |
|
33 |
+
# Transcription function
|
|
|
34 |
def transcribe_audio(audio):
|
35 |
try:
|
36 |
+
# Lazy-load the model and processor inside the request handler
|
37 |
+
global processor, model
|
38 |
+
if 'processor' not in globals() or 'model' not in globals():
|
39 |
+
processor, model = initialize_model()
|
40 |
+
|
41 |
+
# Handle microphone input or uploaded file
|
42 |
if isinstance(audio, tuple): # Microphone input
|
43 |
audio_data = audio[1]
|
44 |
sample_rate = audio[0]
|
|
|
48 |
audio_data = resample_audio(audio_data, sample_rate)
|
49 |
|
50 |
# Preprocess and perform inference
|
51 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
52 |
+
input_features = processor(audio_data, return_tensors="pt", sampling_rate=16000).input_features.to(device)
|
53 |
with torch.no_grad():
|
54 |
predicted_ids = model.generate(input_features)
|
55 |
|