Update app.py
app.py CHANGED
@@ -3,41 +3,39 @@ import librosa
 import numpy as np
 import torch
 import logging
-from transformers import AutoModelForAudioClassification
-import soundfile as sf
-from scipy.signal import butter, lfilter
+from transformers import AutoModelForAudioClassification

 logging.basicConfig(level=logging.INFO)

 model_path = "./"
 model = AutoModelForAudioClassification.from_pretrained(model_path)
-processor = Wav2Vec2Processor.from_pretrained(model_path)

-def …
-    …
-    …
-    …
-    b, a = butter(order, [low, high], btype='band')
-    return b, a
+def augment_and_extract_features(audio_path, sr=16000, n_mfcc=40, n_fft=2048, hop_length=512, target_length=512):
+    y, sr = librosa.load(audio_path, sr=sr)
+    y_pitch_shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=4)
+    y_time_stretched = librosa.effects.time_stretch(y_pitch_shifted, rate=1.2)

-    …
-    …
-    …
-    …
+    mfcc = librosa.feature.mfcc(y=y_time_stretched, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
+    chroma = librosa.feature.chroma_stft(y=y_time_stretched, sr=sr, n_fft=n_fft, hop_length=hop_length)
+    mel = librosa.feature.melspectrogram(y=y_time_stretched, sr=sr, n_fft=n_fft, hop_length=hop_length)
+    contrast = librosa.feature.spectral_contrast(y=y_time_stretched, sr=sr, n_fft=n_fft, hop_length=hop_length)
+    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y_time_stretched), sr=sr)

-    …
-    …
-    y_filtered = butter_bandpass_filter(y, 300, 3400, sr)
-    y_quantum = np.fft.fft(y_filtered)
-    y_ifft = np.fft.ifft(y_quantum).real
+    features = np.concatenate((mfcc, chroma, mel, contrast, tonnetz), axis=0)
+    features_normalized = (features - np.mean(features, axis=1, keepdims=True)) / np.std(features, axis=1, keepdims=True)

-    …
+    if features_normalized.shape[1] > target_length:
+        features_normalized = features_normalized[:, :target_length]
+    else:
+        padding = target_length - features_normalized.shape[1]
+        features_normalized = np.pad(features_normalized, ((0, 0), (0, padding)), 'constant')

-    …
+    features_tensor = torch.tensor(features_normalized).float().unsqueeze(0)  # Add batch dimension
+    return features_tensor

 def predict_voice(audio_file_path):
     try:
-        features_tensor = …
+        features_tensor = augment_and_extract_features(audio_file_path)
         with torch.no_grad():
             outputs = model(features_tensor)
@@ -62,4 +60,3 @@ iface = gr.Interface(
 )

 iface.launch()
-
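Note: several of the removed lines did not survive this page's rendering and are marked `…` in the first hunk above. Judging from the surviving fragments (`b, a = butter(order, [low, high], btype='band')`, `return b, a`, and the `butter_bandpass_filter(y, 300, 3400, sr)` call), the deleted helpers appear to follow the standard SciPy Butterworth band-pass recipe. A sketch of that recipe for reference; this is an assumption, not recovered text:

from scipy.signal import butter, lfilter

def butter_bandpass(lowcut, highcut, fs, order=5):
    # Normalize the band edges to the Nyquist frequency before building
    # the Butterworth band-pass coefficients.
    nyq = 0.5 * fs
    low = lowcut / nyq
    high = highcut / nyq
    b, a = butter(order, [low, high], btype='band')
    return b, a

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    # Filter a signal; consistent with the removed butter_bandpass_filter(y, 300, 3400, sr) call.
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    return lfilter(b, a, data)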
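With librosa's default feature sizes, the new `augment_and_extract_features` stacks 40 MFCC + 12 chroma + 128 mel + 7 spectral-contrast + 6 tonnetz rows, i.e. 193 rows, padded or truncated to 512 frames. A quick shape smoke test of that expectation (the `sample.wav` path is a stand-in):

features = augment_and_extract_features("sample.wav")  # "sample.wav" is a stand-in path
print(features.shape)                                  # expected: torch.Size([1, 193, 512])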
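One fragile spot in the added normalization: `np.std(features, axis=1, keepdims=True)` is zero for any constant feature row (e.g. digital silence), which turns those rows of `features_normalized` into NaNs. A guarded variant, offered as a sketch rather than as part of this commit:

mean = np.mean(features, axis=1, keepdims=True)
std = np.std(features, axis=1, keepdims=True)
# Floor the per-row std so constant rows divide by a small epsilon instead of zero.
features_normalized = (features - mean) / np.maximum(std, 1e-8)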