Update app.py
app.py
CHANGED
@@ -1,42 +1,42 @@
-import
+import torch
+import torch.nn.functional as F
 import librosa
 import numpy as np
-import
-import logging
+import gradio as gr
 from transformers import AutoModelForAudioClassification
+import logging

 logging.basicConfig(level=logging.INFO)

+# Load your model here
 model_path = "./"
 model = AutoModelForAudioClassification.from_pretrained(model_path)

 def preprocess_audio(audio_path, sr=22050):
-    # Load and trim the audio file
     audio, sr = librosa.load(audio_path, sr=sr)
     audio, _ = librosa.effects.trim(audio)
     return audio, sr

 def extract_features(audio, sr):
-
-
-
-
-
-
-    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(audio), sr=sr)
+    S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128, hop_length=512, n_fft=2048)
+    S_DB = librosa.power_to_db(S, ref=np.max)
+
+    # Reshape the spectrogram to a sequence of overlapping 16x16 patches
+    patches = librosa.util.frame(S_DB.flatten(), frame_length=16*16, hop_length=(16-6)*(16-6)).T
+    patches = patches.reshape(patches.shape[0], 16, 16)

-    #
-
-
+    # Linear projection layer equivalent (patch embedding layer)
+    patch_embeddings = patches.reshape(patches.shape[0], -1)
+    patch_embeddings = torch.tensor(patch_embeddings).float()

-
+    # Assuming positional embeddings and [CLS] token embedding are handled within the model
+    return patch_embeddings.unsqueeze(0)  # Add batch dimension for compatibility with model

 def predict_voice(audio_file_path):
     try:
         audio, sr = preprocess_audio(audio_file_path)
         features = extract_features(audio, sr)

-        # Model prediction
         with torch.no_grad():
             outputs = model(features)
             logits = outputs.logits
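For reference, a minimal standalone sketch (not part of the commit) tracing the tensor shapes the new extract_features produces. The 3-second synthetic clip is an assumption; real inputs vary in length, so the patch count n_patches varies with them.

import librosa
import numpy as np
import torch

audio = np.random.randn(3 * 22050).astype(np.float32)  # stand-in for a loaded clip
sr = 22050

S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128, hop_length=512, n_fft=2048)
S_DB = librosa.power_to_db(S, ref=np.max)               # shape: (128, n_frames)

# frame_length = 16*16 = 256 and hop_length = (16-6)*(16-6) = 100,
# i.e. overlapping 256-value windows over the flattened spectrogram
patches = librosa.util.frame(S_DB.flatten(), frame_length=16 * 16, hop_length=(16 - 6) * (16 - 6)).T
patches = patches.reshape(patches.shape[0], 16, 16)     # (n_patches, 16, 16)

patch_embeddings = torch.tensor(patches.reshape(patches.shape[0], -1)).float()
print(patch_embeddings.unsqueeze(0).shape)              # torch.Size([1, n_patches, 256])

One design point worth flagging: because S_DB is flattened row-major before framing, each 16x16 patch spans a stretch of one or two mel bands across many time steps, not a square time-frequency window; whether that matches the input the classifier was trained on is worth checking against the model card.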
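The hunk ends at logits = outputs.logits, and the Gradio interface implied by the new import gradio as gr is not shown. A hypothetical continuation of app.py, assuming a single audio-file input and that the model's id2label mapping is configured; none of these names or choices are confirmed by the commit:

        # Hypothetical continuation (assumed, not shown in this commit); relies on
        # model, predict_voice, and the torch.nn.functional / gradio imports above.
        probs = F.softmax(logits, dim=-1)            # logits -> class probabilities
        idx = int(torch.argmax(probs, dim=-1))
        label = model.config.id2label[idx]           # assumes id2label is set on the config
        return f"{label} ({probs[0, idx].item():.2%})"
    except Exception as e:
        logging.exception("Prediction failed")
        return f"Error: {e}"

iface = gr.Interface(
    fn=predict_voice,
    inputs=gr.Audio(type="filepath"),  # Gradio passes the uploaded file's path
    outputs="text",
)
iface.launch()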