Kabatubare committed
Commit 14ac9f5 · verified · 1 Parent(s): 805e4a6

Update app.py

Files changed (1):
  app.py  +12 -32
app.py CHANGED
@@ -10,51 +10,31 @@ logging.basicConfig(level=logging.INFO)
 model_path = "./"
 model = AutoModelForAudioClassification.from_pretrained(model_path)

-def preprocess_audio(audio_path, sr=22050):
+def preprocess_audio(audio_path, sr=16000):
+    # Load the audio file. Note: Adjusting the sample rate (sr) to match the model's expected input
     audio, sr = librosa.load(audio_path, sr=sr)
+    # Trim silence from the beginning and the end
     audio, _ = librosa.effects.trim(audio)
     return audio, sr

-def extract_patches(S_DB, patch_size=16, patch_overlap=6):
-    stride = patch_size - patch_overlap
-    num_patches_time = (S_DB.shape[1] - patch_overlap) // stride
-    num_patches_freq = (S_DB.shape[0] - patch_overlap) // stride
-
-    patches = []
-    for i in range(0, num_patches_freq * stride, stride):
-        for j in range(0, num_patches_time * stride, stride):
-            patch = S_DB[i:i+patch_size, j:j+patch_size]
-            if patch.shape == (patch_size, patch_size):
-                patches.append(patch.reshape(-1))
-    return np.stack(patches) if patches else np.empty((0, patch_size*patch_size))
-
-def extract_features(audio, sr):
+def extract_features(audio, sr=16000):
+    # Compute the Mel spectrogram
     S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128, hop_length=512, n_fft=2048)
+    # Convert to dB scale
     S_DB = librosa.power_to_db(S, ref=np.max)
-    patches = extract_patches(S_DB)
-
-    # Assuming each patch is flattened to a vector of size 256 (16*16) and then projected to 768 dimensions
-    # Here we simulate this projection by creating a dummy tensor, in practice, this should be done by a learned linear layer
-    patches_tensor = torch.tensor(patches).float()
-    # Simulate linear projection (e.g., via a fully connected layer) to match the embedding size
-    if patches_tensor.nelement() == 0: # Handle case of no patches
-        patch_embeddings_tensor = torch.empty(0, 768)
-    else:
-        patch_embeddings_tensor = patches_tensor # This is a placeholder, replace with actual projection
-
-    return patch_embeddings_tensor.unsqueeze(0) # Add batch dimension for compatibility with model
+    # Normally, further feature extraction steps would be here. For this model, we will directly use S_DB.
+    return S_DB

 def predict_voice(audio_file_path):
     try:
         audio, sr = preprocess_audio(audio_file_path)
-        features = extract_features(audio, sr)
+        S_DB = extract_features(audio, sr)

-        # Adjust the features size to match the model input, if necessary
-        # Example: Reshape or pad the features tensor
-        # features = adjust_features_shape(features, expected_shape)
+        # Convert S_DB to tensor and add required batch dimension
+        S_DB_tensor = torch.tensor(S_DB).unsqueeze(0)

         with torch.no_grad():
-            outputs = model(features)
+            outputs = model(S_DB_tensor)
             logits = outputs.logits
             predicted_index = logits.argmax()
             label = model.config.id2label[predicted_index.item()]
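
For reference, the revised path can be exercised end to end roughly as in the sketch below. This is a minimal sketch, not part of the commit: the input file name sample.wav and the explicit float32 cast are assumptions, and whether the checkpoint in "./" actually accepts a (1, 128, time_frames) spectrogram tensor depends on the underlying architecture; the sketch only mirrors what the committed code passes to the model.

# Minimal sketch of the updated inference path (assumptions: sample.wav as input,
# a float32 cast, and a checkpoint in ./ that accepts a (1, 128, time_frames) tensor).
import librosa
import numpy as np
import torch
from transformers import AutoModelForAudioClassification

model = AutoModelForAudioClassification.from_pretrained("./")
model.eval()

# Preprocess: resample to 16 kHz and trim leading/trailing silence, as in the commit
audio, sr = librosa.load("sample.wav", sr=16000)
audio, _ = librosa.effects.trim(audio)

# Features: 128-band Mel spectrogram converted to dB, shape (128, time_frames)
S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128, hop_length=512, n_fft=2048)
S_DB = librosa.power_to_db(S, ref=np.max)

# Batch dimension plus an explicit float32 cast (torch.tensor keeps whatever dtype
# librosa produced, so the cast keeps the model input predictable)
features = torch.tensor(S_DB, dtype=torch.float32).unsqueeze(0)  # (1, 128, time_frames)

with torch.no_grad():
    logits = model(features).logits
print(model.config.id2label[logits.argmax().item()])

If the checkpoint instead expects raw waveform input (as many audio classification models do), the spectrogram step would be replaced by the model's own feature extractor; the sketch deliberately follows the committed code rather than any particular architecture.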