Kabatubare committed
Commit 323b26a · verified · 1 Parent(s): 94d6357

Update app.py

Files changed (1)
  1. app.py +18 -34
app.py CHANGED
@@ -5,63 +5,47 @@ import torch
 import logging
 import soundfile as sf
 from transformers import AutoModelForAudioClassification
-from librosa.effects import pitch_shift, time_stretch
 
-# Enhanced logging for detailed debugging and information
 logging.basicConfig(level=logging.INFO)
 
-# Load your model (Ensure the model path is correct)
 model_path = "./"
 model = AutoModelForAudioClassification.from_pretrained(model_path)
 
-def augment_and_extract_features(audio_path, sr=16000, n_mfcc=40, pitch_shift_steps=4, stretch_rate=1.2):
-    """
-    Load, augment, and extract comprehensive features from an audio file.
-    Features: MFCC, Chroma, Mel Spectrogram, Spectral Contrast, Tonnetz.
-    Augmentations: Pitch Shift and Time Stretch for nuanced variability.
-    """
-    # Load audio
+def augment_and_extract_features(audio_path, sr=16000, n_mfcc=40, n_fft=2048, hop_length=512):
     y, sr = librosa.load(audio_path, sr=sr)
+    y_augmented = librosa.effects.pitch_shift(y, sr=sr, n_steps=4)
+    y_augmented = librosa.effects.time_stretch(y_augmented, rate=1.2)
 
-    # Audio Augmentation
-    y_pitched = pitch_shift(y, sr=sr, n_steps=pitch_shift_steps)  # Apply pitch shift
-    y_stretched = time_stretch(y_pitched, rate=stretch_rate)  # Apply time stretch
+    mfcc = librosa.feature.mfcc(y=y_augmented, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
+    chroma = librosa.feature.chroma_stft(y=y_augmented, sr=sr, n_fft=n_fft, hop_length=hop_length)
+    mel = librosa.feature.melspectrogram(y=y_augmented, sr=sr, n_fft=n_fft, hop_length=hop_length)
+    contrast = librosa.feature.spectral_contrast(y=y_augmented, sr=sr, n_fft=n_fft, hop_length=hop_length)
+    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y_augmented), sr=sr)
 
-    # Feature Extraction
-    mfcc = librosa.feature.mfcc(y=y_stretched, sr=sr, n_mfcc=n_mfcc)
-    chroma = librosa.feature.chroma_stft(y=y_stretched, sr=sr)
-    mel = librosa.feature.melspectrogram(y=y_stretched, sr=sr)
-    contrast = librosa.feature.spectral_contrast(y=y_stretched, sr=sr)
-    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y_stretched), sr=sr)
-
-    # Aggregate features
-    features = np.concatenate([mfcc, chroma, mel, contrast, tonnetz], axis=0)
-    features_normalized = StandardScaler().fit_transform(features.T).T  # Normalize
-    features_tensor = torch.tensor(features_normalized).float().unsqueeze(0)  # Tensor conversion
+    features = np.concatenate((mfcc, chroma, mel, contrast, tonnetz), axis=0)
+    features = (features - np.mean(features, axis=1, keepdims=True)) / np.std(features, axis=1, keepdims=True)
 
+    features_tensor = torch.tensor(features).float().unsqueeze(0)
     return features_tensor
 
 def predict_voice(audio_file_path):
-    """
-    Make a voice authenticity prediction based on augmented and extracted audio features.
-    """
     try:
         features_tensor = augment_and_extract_features(audio_file_path)
         with torch.no_grad():
             outputs = model(features_tensor)
+
         logits = outputs.logits
         predicted_index = logits.argmax()
         label = model.config.id2label[predicted_index.item()]
         confidence = torch.softmax(logits, dim=1).max().item() * 100
-        result = f"Prediction: {label} with {confidence:.2f}% confidence."
-        logging.info(result)
-        return result
+
+        result = f"The voice is classified as '{label}' with a confidence of {confidence:.2f}%."
+        logging.info("Prediction successful.")
     except Exception as e:
-        error_msg = f"Error during processing: {e}"
-        logging.error(error_msg)
-        return error_msg
+        result = f"Error during processing: {e}"
+        logging.error(result)
+    return result
 
-# Gradio interface for easy and interactive model testing
 iface = gr.Interface(
     fn=predict_voice,
     inputs=gr.Audio(label="Upload Audio File", type="filepath"),
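
Note on the normalization change above: the new in-place z-score (subtracting the per-row mean and dividing by the per-row standard deviation across time frames) computes, up to floating-point precision, the same result as the removed StandardScaler().fit_transform(features.T).T call, so the scikit-learn dependency goes away without changing the features the model sees. A minimal sketch to illustrate the equivalence, assuming numpy and scikit-learn are installed; demo_features and its shape are illustrative only (193 rows would correspond to 40 MFCC + 12 chroma + 128 mel + 7 contrast + 6 tonnetz rows under librosa defaults).

# Equivalence check between the commit's manual z-score and the removed StandardScaler path.
import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
demo_features = rng.normal(size=(193, 120))  # illustrative stand-in: feature rows x time frames

# New approach in this commit: standardize each feature row across frames.
manual = (demo_features - np.mean(demo_features, axis=1, keepdims=True)) / np.std(demo_features, axis=1, keepdims=True)

# Removed approach: StandardScaler over the transposed (frames x features) matrix, transposed back.
scaled = StandardScaler().fit_transform(demo_features.T).T

print(np.allclose(manual, scaled))  # True: both divide by the population standard deviation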
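
The hunk ends inside the gr.Interface(...) call, so the remaining interface arguments are not visible in this commit view. For orientation only, a typical Gradio wiring for a filepath-in, text-out function like predict_voice is sketched below; the output component and the launch() call are assumptions, not content of app.py.

# Hypothetical wiring for illustration; the actual outputs/launch code in app.py is not shown in this hunk.
import gradio as gr

iface = gr.Interface(
    fn=predict_voice,  # defined in app.py above; returns the result string
    inputs=gr.Audio(label="Upload Audio File", type="filepath"),
    outputs=gr.Textbox(label="Result"),  # assumed output component
)

if __name__ == "__main__":
    iface.launch()  # standard Gradio entry point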