Update app.py
app.py CHANGED
@@ -10,51 +10,31 @@ logging.basicConfig(level=logging.INFO)
 model_path = "./"
 model = AutoModelForAudioClassification.from_pretrained(model_path)
 
-def preprocess_audio(audio_path, sr=
+def preprocess_audio(audio_path, sr=16000):
+    # Load the audio file. Note: Adjusting the sample rate (sr) to match the model's expected input
     audio, sr = librosa.load(audio_path, sr=sr)
+    # Trim silence from the beginning and the end
     audio, _ = librosa.effects.trim(audio)
     return audio, sr
 
-def
-
-    num_patches_time = (S_DB.shape[1] - patch_overlap) // stride
-    num_patches_freq = (S_DB.shape[0] - patch_overlap) // stride
-
-    patches = []
-    for i in range(0, num_patches_freq * stride, stride):
-        for j in range(0, num_patches_time * stride, stride):
-            patch = S_DB[i:i+patch_size, j:j+patch_size]
-            if patch.shape == (patch_size, patch_size):
-                patches.append(patch.reshape(-1))
-    return np.stack(patches) if patches else np.empty((0, patch_size*patch_size))
-
-def extract_features(audio, sr):
+def extract_features(audio, sr=16000):
+    # Compute the Mel spectrogram
     S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128, hop_length=512, n_fft=2048)
+    # Convert to dB scale
     S_DB = librosa.power_to_db(S, ref=np.max)
-
-
-    # Assuming each patch is flattened to a vector of size 256 (16*16) and then projected to 768 dimensions
-    # Here we simulate this projection by creating a dummy tensor, in practice, this should be done by a learned linear layer
-    patches_tensor = torch.tensor(patches).float()
-    # Simulate linear projection (e.g., via a fully connected layer) to match the embedding size
-    if patches_tensor.nelement() == 0: # Handle case of no patches
-        patch_embeddings_tensor = torch.empty(0, 768)
-    else:
-        patch_embeddings_tensor = patches_tensor # This is a placeholder, replace with actual projection
-
-    return patch_embeddings_tensor.unsqueeze(0) # Add batch dimension for compatibility with model
+    # Normally, further feature extraction steps would be here. For this model, we will directly use S_DB.
+    return S_DB
 
 def predict_voice(audio_file_path):
     try:
         audio, sr = preprocess_audio(audio_file_path)
-
+        S_DB = extract_features(audio, sr)
 
-        #
-
-        # features = adjust_features_shape(features, expected_shape)
+        # Convert S_DB to tensor and add required batch dimension
+        S_DB_tensor = torch.tensor(S_DB).unsqueeze(0)
 
         with torch.no_grad():
-            outputs = model(
+            outputs = model(S_DB_tensor)
         logits = outputs.logits
         predicted_index = logits.argmax()
         label = model.config.id2label[predicted_index.item()]
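
For context, a minimal sketch of how the updated pipeline could be exercised end to end in the Space. Only the functions shown in the diff come from this commit; the Gradio wiring (gr.Interface, the demo name, type="filepath"), the omission of the try/except, and the assumption that the checkpoint accepts a raw mel-spectrogram tensor are illustrative guesses, not part of the change.

# Sketch only: assumed Gradio wiring for the Space (not part of this commit).
import gradio as gr
import librosa
import numpy as np
import torch
from transformers import AutoModelForAudioClassification

model = AutoModelForAudioClassification.from_pretrained("./")

def preprocess_audio(audio_path, sr=16000):
    # Resample to the assumed model rate and trim leading/trailing silence.
    audio, sr = librosa.load(audio_path, sr=sr)
    audio, _ = librosa.effects.trim(audio)
    return audio, sr

def extract_features(audio, sr=16000):
    # 128-band mel spectrogram converted to dB, used directly as the model input.
    S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128, hop_length=512, n_fft=2048)
    return librosa.power_to_db(S, ref=np.max)

def predict_voice(audio_file_path):
    audio, sr = preprocess_audio(audio_file_path)
    S_DB = extract_features(audio, sr)
    # librosa produces float32, so the tensor dtype already matches the model;
    # unsqueeze(0) adds the batch dimension, giving shape (1, 128, time_frames).
    S_DB_tensor = torch.tensor(S_DB).unsqueeze(0)
    with torch.no_grad():
        logits = model(S_DB_tensor).logits
    return model.config.id2label[logits.argmax().item()]

# type="filepath" hands predict_voice a path on disk, matching its signature.
demo = gr.Interface(fn=predict_voice, inputs=gr.Audio(type="filepath"), outputs="text")

if __name__ == "__main__":
    demo.launch()

This mirrors the new predict_voice by passing the spectrogram tensor positionally; if the checkpoint was trained with a paired feature extractor, the expected input shape and normalization may differ, so treat the direct model(S_DB_tensor) call as a simplification.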