import librosa
import numpy as np
import torch
import logging
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
import gradio as gr

logging.basicConfig(level=logging.INFO)

# Path to your Wav2Vec2 model and processor
model_path = "./wav2vec2-sequence-classification"
try:
    model = Wav2Vec2ForSequenceClassification.from_pretrained(model_path)
    processor = Wav2Vec2Processor.from_pretrained(model_path)
    model.eval()  # inference only: disable dropout
    logging.info("Model and processor loaded successfully.")
except Exception as e:
    logging.error(f"Loading model and processor failed: {e}")
    raise

def preprocess_audio(file_path):
    """
    Load the audio file and resample it to 16 kHz, the rate Wav2Vec2 expects.
    """
    # Load the audio file at its native sample rate
    audio, sr = librosa.load(file_path, sr=None)
    # Resample to 16 kHz if the file uses a different rate
    if sr != 16000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        sr = 16000
    return audio, sr

def audio_to_features(audio, sr):
    """
    Convert audio waveform to model features.
    """
    # Use the processor to prepare the input tensor for the model.
    # truncation is omitted: the feature extractor rejects truncation=True
    # without an explicit max_length, and a single whole clip needs no truncation.
    return processor(audio, sampling_rate=sr, return_tensors="pt", padding=True).input_values

def classify_audio(file_path):
    """
    Classify the content of the audio file.
    """
    try:
        audio, sr = preprocess_audio(file_path)
        input_values = audio_to_features(audio, sr)
        
        # Inference
        with torch.no_grad():
            logits = model(input_values).logits
        
        # Post-processing: convert logits to probabilities with softmax
        probabilities = torch.softmax(logits, dim=-1)[0].numpy()
        
        # Label names come from the model config (these default to
        # "LABEL_0", "LABEL_1", ... if no custom names were saved with the model)
        labels = [model.config.id2label[i] for i in range(len(probabilities))]
        predictions = dict(zip(labels, probabilities))
        
        # Format the prediction output, one "label: probability" pair per line
        prediction_output = "\n".join(f"{label}: {prob:.4f}" for label, prob in predictions.items())
        return prediction_output
    except Exception as e:
        logging.error(f"Error during classification: {e}")
        return f"Classification error: {e}"

# Gradio interface
iface = gr.Interface(
    fn=classify_audio,
    inputs=gr.Audio(type="filepath", label="Audio file"),
    outputs="text",
    title="Audio Classification with Wav2Vec2",
    description="Upload an audio file to classify its content using a Wav2Vec2 model."
)

# Launch the interface
if __name__ == "__main__":
    iface.launch()
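
# A minimal sketch of calling the classifier directly, without launching the
# Gradio UI. "example.wav" is a placeholder path (not part of this project);
# point it at any local audio file:
#
#     print(classify_audio("example.wav"))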