Kabatubare committed on
Commit 94d6357 · verified · 1 Parent(s): e02dec8
Files changed (1)
  1. app.py +39 -36
app.py CHANGED
@@ -3,63 +3,65 @@ import librosa
 import numpy as np
 import torch
 import logging
-from transformers import AutoModelForAudioClassification
 import soundfile as sf
+from transformers import AutoModelForAudioClassification
+from librosa.effects import pitch_shift, time_stretch
 
-# Configure logging for debugging and information
+# Enhanced logging for detailed debugging and information
 logging.basicConfig(level=logging.INFO)
 
-# Load the model
+# Load your model (Ensure the model path is correct)
 model_path = "./"
 model = AutoModelForAudioClassification.from_pretrained(model_path)
 
-def augment_and_extract_features(audio_path, output_path=None, sr=16000, n_mfcc=40, n_fft=2048, hop_length=512):
-    # Load and augment the audio file
+def augment_and_extract_features(audio_path, sr=16000, n_mfcc=40, pitch_shift_steps=4, stretch_rate=1.2):
+    """
+    Load, augment, and extract comprehensive features from an audio file.
+    Features: MFCC, Chroma, Mel Spectrogram, Spectral Contrast, Tonnetz.
+    Augmentations: Pitch Shift and Time Stretch for nuanced variability.
+    """
+    # Load audio
     y, sr = librosa.load(audio_path, sr=sr)
-    y_augmented = librosa.effects.pitch_shift(y, sr, n_steps=4)  # Pitch shifting
-    y_augmented = librosa.effects.time_stretch(y_augmented, rate=1.2)  # Time stretching
-
-    # Save the augmented audio if an output path is provided
-    if output_path is not None:
-        sf.write(output_path, y_augmented, sr)
-
-    # Extract features
-    mfcc = librosa.feature.mfcc(y=y_augmented, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
-    chroma = librosa.feature.chroma_stft(y=y_augmented, sr=sr, n_fft=n_fft, hop_length=hop_length)
-    mel = librosa.feature.melspectrogram(y=y_augmented, sr=sr, n_fft=n_fft, hop_length=hop_length)
-    contrast = librosa.feature.spectral_contrast(y=y_augmented, sr=sr, n_fft=n_fft, hop_length=hop_length)
-    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y_augmented), sr=sr)
-
-    # Combine all features
-    features = np.concatenate((mfcc, chroma, mel, contrast, tonnetz), axis=0)
-
-    # Normalize features
-    features = (features - np.mean(features, axis=1, keepdims=True)) / np.std(features, axis=1, keepdims=True)
-
-    # Convert to tensor
-    features_tensor = torch.tensor(features).float().unsqueeze(0)  # Add batch dimension
+
+    # Audio Augmentation
+    y_pitched = pitch_shift(y, sr=sr, n_steps=pitch_shift_steps)  # Apply pitch shift
+    y_stretched = time_stretch(y_pitched, rate=stretch_rate)  # Apply time stretch
+
+    # Feature Extraction
+    mfcc = librosa.feature.mfcc(y=y_stretched, sr=sr, n_mfcc=n_mfcc)
+    chroma = librosa.feature.chroma_stft(y=y_stretched, sr=sr)
+    mel = librosa.feature.melspectrogram(y=y_stretched, sr=sr)
+    contrast = librosa.feature.spectral_contrast(y=y_stretched, sr=sr)
+    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y_stretched), sr=sr)
+
+    # Aggregate features
+    features = np.concatenate([mfcc, chroma, mel, contrast, tonnetz], axis=0)
+    features_normalized = StandardScaler().fit_transform(features.T).T  # Normalize
+    features_tensor = torch.tensor(features_normalized).float().unsqueeze(0)  # Tensor conversion
+
     return features_tensor
 
 def predict_voice(audio_file_path):
+    """
+    Make a voice authenticity prediction based on augmented and extracted audio features.
+    """
     try:
         features_tensor = augment_and_extract_features(audio_file_path)
-
         with torch.no_grad():
             outputs = model(features_tensor)
-
         logits = outputs.logits
         predicted_index = logits.argmax()
         label = model.config.id2label[predicted_index.item()]
         confidence = torch.softmax(logits, dim=1).max().item() * 100
-
-        result = f"The voice is classified as '{label}' with a confidence of {confidence:.2f}%."
-        logging.info("Prediction successful.")
+        result = f"Prediction: {label} with {confidence:.2f}% confidence."
+        logging.info(result)
+        return result
     except Exception as e:
-        result = f"Error during processing: {e}"
-        logging.error(result)
-
-    return result
+        error_msg = f"Error during processing: {e}"
+        logging.error(error_msg)
+        return error_msg
 
+# Gradio interface for easy and interactive model testing
 iface = gr.Interface(
     fn=predict_voice,
     inputs=gr.Audio(label="Upload Audio File", type="filepath"),
@@ -69,3 +71,4 @@ iface = gr.Interface(
 )
 
 iface.launch()
+
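
Note: the new normalization line calls StandardScaler, but the commit does not add a matching import, so app.py would raise a NameError when the function runs. Below is a minimal sketch of two ways to close that gap; it assumes scikit-learn is installed for the first option, and the helper names normalize_features / normalize_features_numpy plus the eps guard are illustrative additions, not part of the commit.

import numpy as np
from sklearn.preprocessing import StandardScaler  # import the commit relies on but never adds

def normalize_features(features: np.ndarray) -> np.ndarray:
    # Matches the commit's intent: zero mean / unit variance per feature row,
    # computed by transposing so StandardScaler normalizes along the time axis.
    return StandardScaler().fit_transform(features.T).T

def normalize_features_numpy(features: np.ndarray, eps: float = 1e-8) -> np.ndarray:
    # Dependency-free equivalent of the pre-commit normalization;
    # eps (an addition, not in either version) avoids division by zero for silent frames.
    mean = np.mean(features, axis=1, keepdims=True)
    std = np.std(features, axis=1, keepdims=True)
    return (features - mean) / (std + eps)

The NumPy variant reproduces the behaviour of the version being replaced without pulling in scikit-learn; either one could stand in for the StandardScaler line above.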