import os
import urllib.request

import streamlit as st
import librosa
import numpy as np
import onnxruntime as ort


# Audio padding function
def pad(x, max_len=64600):
    """
    Pad or trim an audio segment to a fixed length by repeating or slicing.
    """
    x_len = x.shape[0]
    if x_len >= max_len:
        return x[:max_len]  # Trim if longer
    # Repeat the signal enough times to cover max_len, then slice
    num_repeats = (max_len // x_len) + 1
    padded_x = np.tile(x, (1, num_repeats))[:, :max_len][0]
    return padded_x


# Preprocess audio for a single segment
def preprocess_audio_segment(segment, cut=64600):
    """
    Preprocess a single audio segment: pad or trim as required.
    """
    segment = pad(segment, max_len=cut)
    return np.expand_dims(np.array(segment, dtype=np.float32), axis=0)  # Add batch dimension


# Sliding-window prediction function
def predict_with_sliding_window(audio_path, onnx_model_url, window_size=64600, step_size=64600, sample_rate=16000):
    """
    Slide a fixed-size window over the audio, classify each segment as Real or Fake,
    and aggregate the per-segment results.
    """
    # ort.InferenceSession expects a local file path (or model bytes), not a URL,
    # so download the model once and cache it next to the script.
    local_model_path = "RawNet_model.onnx"
    if not os.path.exists(local_model_path):
        urllib.request.urlretrieve(onnx_model_url, local_model_path)
    ort_session = ort.InferenceSession(local_model_path)

    # Load the audio file, resampled to the model's expected sample rate
    waveform, _ = librosa.load(audio_path, sr=sample_rate)

    total_segments = []
    total_probabilities = []

    # Sliding-window processing (non-overlapping, since step_size == window_size)
    for start in range(0, len(waveform), step_size):
        end = start + window_size
        segment = waveform[start:end]

        # Preprocess the segment
        audio_tensor = preprocess_audio_segment(segment)

        # Perform inference
        inputs = {ort_session.get_inputs()[0].name: audio_tensor}
        outputs = ort_session.run(None, inputs)
        probabilities = np.exp(outputs[0])  # Convert log-softmax outputs to probabilities
        prediction = np.argmax(probabilities)

        # Store the per-segment result
        predicted_class = "Real" if prediction == 1 else "Fake"
        total_segments.append(predicted_class)
        total_probabilities.append(probabilities[0][prediction])

    # Final aggregation
    majority_class = max(set(total_segments), key=total_segments.count)  # Majority voting
    avg_probability = np.mean(total_probabilities) * 100  # Average probability as a percentage
    return majority_class, avg_probability


# Streamlit app
st.title("Audio Spoof Detection with ONNX Model")
st.write("Upload an audio file to detect if it is Real or Fake.")

# File uploader
uploaded_file = st.file_uploader("Upload your audio file (WAV or MP3)", type=["wav", "mp3"])

if uploaded_file is not None:
    # URL of the ONNX model on the Hugging Face Hub
    onnx_model_url = "https://huggingface.co/Mrkomiljon/DeepVoiceGuard/resolve/main/RawNet_model.onnx"

    # Save the uploaded file temporarily
    with open("temp_audio_file.wav", "wb") as f:
        f.write(uploaded_file.read())

    # Perform prediction
    with st.spinner("Processing..."):
        result, avg_probability = predict_with_sliding_window("temp_audio_file.wav", onnx_model_url)

    # Display results
    st.success(f"Prediction: {result}")
    st.info(f"Confidence: {avg_probability:.2f}%")

    # Clean up the temporary file
    os.remove("temp_audio_file.wav")
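
# Usage sketch (not part of the app itself): the file name app.py and the pip
# package list below are assumptions, not taken from the original code. Save
# this script as app.py, install the dependencies, and launch it with Streamlit:
#
#   pip install streamlit librosa numpy onnxruntime
#   streamlit run app.py
#
# Streamlit serves the app locally (by default on port 8501); an uploaded
# WAV/MP3 file is classified segment by segment and the majority vote is shown.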