Spaces:
Running
Running
import os
import tempfile
import urllib.request
from collections import Counter

import librosa
import numpy as np
import onnxruntime as ort
import streamlit as st

# Audio padding function
def pad(x, max_len=64600):
    """
    Pad or trim an audio segment to a fixed length by repeating or slicing.

    Args:
        x: 1-D array (or array-like, e.g. a list) of audio samples.
        max_len: Target number of samples.

    Returns:
        A NumPy array. When ``x`` already has at least ``max_len`` samples,
        its first ``max_len`` samples. Otherwise ``x`` repeated end-to-end
        and cut to exactly ``max_len`` samples. An empty ``x`` has nothing
        to repeat, so ``max_len`` zeros of the same dtype are returned.
    """
    x = np.asarray(x)  # Also accept plain Python sequences
    x_len = x.shape[0]
    if x_len >= max_len:
        return x[:max_len]  # Trim if longer

    if x_len == 0:
        # Repeating an empty segment would divide by zero below.
        return np.zeros(max_len, dtype=x.dtype)

    # Smallest number of copies that covers max_len (ceiling division)
    num_repeats = -(-max_len // x_len)
    return np.tile(x, (1, num_repeats))[0, :max_len]

# Preprocess audio for a single segment
def preprocess_audio_segment(segment, cut=64600):
    """
    Bring one audio segment to ``cut`` samples and shape it as a batch of one.

    The segment is passed to ``pad`` with ``max_len=cut``, copied into a new
    float32 array, and given a leading axis of size 1.
    """
    fixed_length = pad(segment, max_len=cut)
    as_float32 = np.array(fixed_length, dtype=np.float32)
    return as_float32[np.newaxis, ...]  # Leading batch axis

# Sliding window prediction function
def predict_with_sliding_window(audio_path, onnx_model_url, window_size=64600, step_size=64600, sample_rate=16000):
    """
    Use a sliding window to predict if the audio is real or fake over the entire audio.

    Args:
        audio_path: Audio file location, passed to ``librosa.load``.
        onnx_model_url: Model location, passed unchanged to
            ``ort.InferenceSession``.
            NOTE(review): onnxruntime expects a local file path or serialized
            model bytes here, not an http(s) URL - confirm what callers pass.
        window_size: Number of samples taken from the waveform per window.
        step_size: Number of samples between the starts of consecutive windows.
        sample_rate: Rate the audio is resampled to when loaded.

    Returns:
        A ``(majority_class, avg_probability)`` tuple. ``majority_class`` is
        the label ("Real" or "Fake") predicted for the most windows; on a tie
        the label that occurred first in the audio wins. ``avg_probability``
        is the mean, in percent, of each window's probability for that
        window's own predicted class.
        NOTE(review): windows that voted against the majority still
        contribute their own winning probability to this average - confirm
        this is the intended "confidence".

    Raises:
        ValueError: If ``step_size`` is not positive or the audio contains no
            samples.
    """
    if step_size <= 0:
        raise ValueError("step_size must be a positive number of samples")

    # Load ONNX runtime session
    ort_session = ort.InferenceSession(onnx_model_url)
    input_name = ort_session.get_inputs()[0].name  # Same for every window

    # Load audio file
    waveform, _ = librosa.load(audio_path, sr=sample_rate)
    if len(waveform) == 0:
        raise ValueError(f"No audio samples could be read from {audio_path!r}")

    segment_labels = []
    segment_confidences = []

    # Sliding window processing
    for start in range(0, len(waveform), step_size):
        segment = waveform[start:start + window_size]

        # NOTE(review): the segment is padded/trimmed to
        # preprocess_audio_segment's default length, regardless of window_size.
        audio_tensor = preprocess_audio_segment(segment)

        # Perform inference
        outputs = ort_session.run(None, {input_name: audio_tensor})

        # exp() yields probabilities only if the model emits log-softmax
        # scores - presumably true for this model; TODO confirm.
        probabilities = np.exp(outputs[0])
        prediction = np.argmax(probabilities)

        # Store the results (class index 1 is treated as "Real")
        segment_labels.append("Real" if prediction == 1 else "Fake")
        segment_confidences.append(probabilities[0][prediction])

    # Majority voting. Counter breaks ties by first occurrence, whereas
    # iterating a set of strings depends on per-process hash randomization.
    majority_class = Counter(segment_labels).most_common(1)[0][0]
    avg_probability = np.mean(segment_confidences) * 100  # Percentage
    return majority_class, avg_probability

# Streamlit app: upload an audio file, run the sliding-window detector on it,
# and show the majority label with its averaged confidence.
st.title("Audio Spoof Detection with ONNX Model")
st.write("Upload an audio file to detect if it is Real or Fake.")

# File uploader
uploaded_file = st.file_uploader("Upload your audio file (WAV or MP3)", type=["wav", "mp3"])

if uploaded_file is not None:
    # onnxruntime loads models from a local path, not from a URL, so the model
    # is downloaded once and reused. The "resolve" URL serves the raw file;
    # the "blob" URL serves an HTML page.
    onnx_model_url = "https://huggingface.co/Mrkomiljon/DeepVoiceGuard/resolve/main/RawNet_model.onnx"
    onnx_model_path = "RawNet_model.onnx"

    temp_audio_path = None
    try:
        with st.spinner("Processing..."):
            if not os.path.exists(onnx_model_path):
                # Download under a temporary name so an interrupted transfer
                # never leaves a truncated model at the final path.
                partial_path = onnx_model_path + ".part"
                urllib.request.urlretrieve(onnx_model_url, partial_path)
                os.replace(partial_path, onnx_model_path)

            # Save the upload under a unique name so concurrent sessions do
            # not overwrite each other, keeping the upload's own extension.
            suffix = os.path.splitext(uploaded_file.name)[1] or ".wav"
            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as f:
                temp_audio_path = f.name
                f.write(uploaded_file.read())

            # Perform prediction
            result, avg_probability = predict_with_sliding_window(temp_audio_path, onnx_model_path)
    finally:
        # Clean up the temporary file even when prediction fails
        if temp_audio_path is not None and os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)

    # Display results
    st.success(f"Prediction: {result}")
    st.info(f"Confidence: {avg_probability:.2f}%")