import torch
import torch.nn as nn
import torch.nn.functional as F
from speechbrain.pretrained import EncoderClassifier
import numpy as np
from scipy.spatial.distance import cosine
import librosa
import torchaudio
import gradio as gr
import noisereduce as nr

from transformers import WavLMForXVector, Wav2Vec2FeatureExtractor


def reduce_noise(waveform, sample_rate=16000):
    """
    Apply mild noise reduction to the waveform, tuned for voice audio.
    The parameters are chosen to minimize alteration of the original voice.

    Parameters:
        waveform (torch.Tensor): Audio tensor of shape (1, n_samples)
        sample_rate (int): Sampling rate of the audio

    Returns:
        torch.Tensor: Denoised audio tensor of shape (1, n_samples)
    """
    waveform_np = waveform.squeeze(0).cpu().numpy()
    # prop_decrease=0.5 keeps the reduction mild so the voice itself is not distorted.
    reduced_noise = nr.reduce_noise(y=waveform_np, sr=sample_rate, prop_decrease=0.5)
    return torch.from_numpy(reduced_noise).unsqueeze(0)


def remove_long_silence(waveform, sample_rate=16000, top_db=20, max_silence_length=1.0):
    """
    Remove silence segments longer than max_silence_length seconds from the audio.
    This function uses librosa.effects.split to detect non-silent intervals and
    preserves at most max_silence_length seconds of silence between speech segments.

    Parameters:
        waveform (torch.Tensor): Audio tensor of shape (1, n_samples)
        sample_rate (int): Sampling rate of the audio
        top_db (int): Threshold (in decibels) below the reference to consider as silence
        max_silence_length (float): Maximum allowed silence duration in seconds

    Returns:
        torch.Tensor: Processed audio tensor with long silences removed
    """
    waveform_np = waveform.squeeze(0).cpu().numpy()

    non_silent_intervals = librosa.effects.split(waveform_np, top_db=top_db)
    if len(non_silent_intervals) == 0:
        return waveform

    output_segments = []
    max_silence_samples = int(max_silence_length * sample_rate)

    # Keep at most max_silence_length seconds of leading silence.
    if non_silent_intervals[0][0] > 0:
        output_segments.append(waveform_np[:min(non_silent_intervals[0][0], max_silence_samples)])

    # Keep each speech segment and cap the silence between consecutive segments.
    for i, (start, end) in enumerate(non_silent_intervals):
        output_segments.append(waveform_np[start:end])
        if i < len(non_silent_intervals) - 1:
            next_start = non_silent_intervals[i + 1][0]
            gap = next_start - end
            if gap > max_silence_samples:
                output_segments.append(waveform_np[end:end + max_silence_samples])
            else:
                output_segments.append(waveform_np[end:next_start])

    # Keep at most max_silence_length seconds of trailing silence.
    if non_silent_intervals[-1][1] < len(waveform_np):
        gap = len(waveform_np) - non_silent_intervals[-1][1]
        if gap > max_silence_samples:
            output_segments.append(waveform_np[-max_silence_samples:])
        else:
            output_segments.append(waveform_np[non_silent_intervals[-1][1]:])

    processed_waveform = np.concatenate(output_segments)
    return torch.from_numpy(processed_waveform).unsqueeze(0)


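# EnhancedECAPATDNN fuses two pretrained speaker encoders: SpeechBrain's
# ECAPA-TDNN (192-dim embeddings) and Microsoft's WavLM x-vector model
# (512-dim embeddings, projected down to 192). The fused embedding is refined
# by a small MLP and L2-normalized before comparison.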
class EnhancedECAPATDNN(nn.Module):
    def __init__(self):
        super().__init__()

        # Pretrained ECAPA-TDNN speaker encoder from SpeechBrain (192-dim embeddings).
        self.ecapa = EncoderClassifier.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb",
            savedir="pretrained_models/spkrec-ecapa-voxceleb",
            run_opts={"device": "cuda" if torch.cuda.is_available() else "cpu"}
        )

        # Pretrained WavLM x-vector model for speaker verification (512-dim embeddings).
        self.wavlm_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("microsoft/wavlm-base-sv")
        self.wavlm = WavLMForXVector.from_pretrained("microsoft/wavlm-base-sv")
        self.wavlm.to("cuda" if torch.cuda.is_available() else "cpu")

        # Project WavLM embeddings into the 192-dim ECAPA embedding space.
        self.wavlm_proj = nn.Linear(512, 192)

        # Small MLP that refines the fused embedding.
        self.enhancement = nn.Sequential(
            nn.Linear(192, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 192)
        )

        # Transformer encoder used when the ECAPA output contains more than one frame.
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=192, nhead=4, dropout=0.3, batch_first=True),
            num_layers=2
        )

    @torch.no_grad()
    def forward(self, x):
        """
        x: input waveform tensor of shape (1, T) on device.
        """
        # ECAPA embedding: encode_batch returns a tensor of shape (batch, 1, 192).
        emb_ecapa = self.ecapa.encode_batch(x)

        # WavLM x-vector embedding of shape (batch, 512), projected to (batch, 192).
        waveform_np = x.squeeze(0).cpu().numpy()
        wavlm_inputs = self.wavlm_feature_extractor(waveform_np, sampling_rate=16000, return_tensors="pt")
        wavlm_inputs = {k: v.to(x.device) for k, v in wavlm_inputs.items()}
        wavlm_out = self.wavlm(**wavlm_inputs)
        emb_wavlm = wavlm_out.embeddings
        emb_wavlm_proj = self.wavlm_proj(emb_wavlm)

        # Refine the ECAPA output with the transformer when it contains more than one
        # frame; otherwise drop the singleton frame dimension so shapes match below.
        if emb_ecapa.dim() > 2 and emb_ecapa.size(1) > 1:
            emb_ecapa_proc = self.transformer(emb_ecapa)
            emb_ecapa_proc = emb_ecapa_proc.mean(dim=1)
        else:
            emb_ecapa_proc = emb_ecapa.squeeze(1)

        # Fuse the two embeddings by simple averaging.
        fused = (emb_ecapa_proc + emb_wavlm_proj) / 2

        enhanced = self.enhancement(fused)
        output = F.normalize(enhanced, p=2, dim=-1)
        return output


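# ForensicSpeakerVerification wraps the model with audio preprocessing (mono,
# 16 kHz, noise reduction, silence trimming), chunked embedding extraction, and
# cosine-similarity scoring for the Gradio demo.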
class ForensicSpeakerVerification:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")
        self.model = EnhancedECAPATDNN().to(self.device)
        self.model.eval()

        # The optimizer covers only the enhancement MLP and transformer;
        # the pretrained encoders are not fine-tuned.
        trainable_params = list(self.model.enhancement.parameters()) + list(self.model.transformer.parameters())
        self.optimizer = torch.optim.AdamW(trainable_params, lr=1e-4)
        self.training_embeddings = []

    def preprocess_audio(self, file_path, max_duration=10):
        try:
            waveform, sample_rate = torchaudio.load(file_path)
            # Convert to mono.
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)
            # Resample to 16 kHz.
            if sample_rate != 16000:
                resampler = torchaudio.transforms.Resample(sample_rate, 16000)
                waveform = resampler(waveform)
            # Truncate to max_duration seconds.
            max_length = int(16000 * max_duration)
            if waveform.shape[1] > max_length:
                waveform = waveform[:, :max_length]
            # Peak-normalize.
            waveform = waveform / (torch.max(torch.abs(waveform)) + 1e-8)

            waveform = reduce_noise(waveform, sample_rate=16000)
            waveform = remove_long_silence(waveform, sample_rate=16000)
            return waveform.to(self.device)
        except Exception as e:
            raise ValueError(f"Error preprocessing audio: {str(e)}")

    @torch.no_grad()
    def extract_embedding(self, file_path, chunk_duration=3, overlap=0.5):
        waveform = self.preprocess_audio(file_path)
        sample_rate = 16000
        chunk_size = int(chunk_duration * sample_rate)
        hop_size = int(chunk_size * (1 - overlap))
        embeddings = []
        if waveform.shape[1] > chunk_size:
            # Embed overlapping chunks and average them into a single embedding.
            for start in range(0, waveform.shape[1] - chunk_size + 1, hop_size):
                chunk = waveform[:, start:start + chunk_size]
                emb = self.model(chunk)
                embeddings.append(emb)
            final_emb = torch.mean(torch.cat(embeddings, dim=0), dim=0, keepdim=True)
        else:
            final_emb = self.model(waveform)
        return final_emb.cpu().numpy()

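    # Similarity is 1 minus the cosine distance between the two embeddings;
    # scores at or above 0.6 are reported as the same speaker.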
    def verify_speaker(self, questioned_audio, suspect_audio, progress=gr.Progress()):
        if not questioned_audio or not suspect_audio:
            return "⚠️ Please provide both audio samples"
        try:
            progress(0.2, desc="Processing questioned audio...")
            questioned_emb = self.extract_embedding(questioned_audio)
            progress(0.4, desc="Processing suspect audio...")
            suspect_emb = self.extract_embedding(suspect_audio)
            progress(0.6, desc="Computing similarity...")
            score = 1 - cosine(questioned_emb.flatten(), suspect_emb.flatten())

            # Clamp to [0, 100] so a negative cosine similarity cannot break the display.
            probability = max(0.0, min(100.0, score * 100))

            heat_bar = f"""
            <div style="width:100%; height:30px; position:relative; margin-bottom:10px;">
                <div style="width:100%; height:20px; background: linear-gradient(to right, #FF0000, #FFFF00, #00FF00); border-radius:10px;"></div>
                <div style="position:absolute; left:{probability}%; top:0; transform:translateX(-50%);">
                    <div style="width:0; height:0; border-left:8px solid transparent; border-right:8px solid transparent; border-bottom:10px solid black;"></div>
                    <div style="width:2px; height:20px; background-color:black; margin-left:7px;"></div>
                </div>
            </div>
            """

            # Color the score red-to-yellow below 50% and yellow-to-green above it.
            if probability <= 50:
                color = f"rgb(255, {int(255 * (probability / 50))}, 0)"
            else:
                color = f"rgb({int(255 * (2 - probability / 50))}, 255, 0)"

            if score >= 0.6:
                verdict_text = '✅ Same Speaker'
            else:
                verdict_text = '⚠️ Different Speakers'

            result = f"""
            <div style='font-family: Arial, sans-serif; font-size: 16px; background-color: #f5f5f5; padding: 20px; border-radius: 10px; box-shadow: 0 4px 6px rgba(0,0,0,0.1);'>
                <h2 style='color: #333; margin-bottom: 20px;'>Speaker Verification Analysis Results</h2>
                <p style='margin-bottom: 10px; color: black;'>Similarity Score: <strong style='color:{color};'>{probability:.1f}%</strong></p>
                {heat_bar}
                <p style='margin-top: 20px; font-size: 18px; font-weight: bold; color: #333;'>{verdict_text}</p>
            </div>
            """
            progress(1.0)
            return result
        except Exception as e:
            return f"❌ Error during verification: {str(e)}"


speaker_verification = ForensicSpeakerVerification()


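# Gradio interface: two audio inputs (upload or microphone), a compare button,
# and an HTML panel for the formatted verification result.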
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎙️ Forensic Speaker Verification System
        Upload or record two audio samples to compare and verify if they belong to the same speaker.
        """
    )

    with gr.Column():
        questioned_audio = gr.Audio(
            sources=["upload", "microphone"],
            type="filepath",
            label="Questioned Audio Sample"
        )
        suspect_audio = gr.Audio(
            sources=["upload", "microphone"],
            type="filepath",
            label="Suspect Audio Sample"
        )
        test_button = gr.Button("🔍 Compare Speakers", variant="primary")
        test_output = gr.HTML()

    test_button.click(
        fn=speaker_verification.verify_speaker,
        inputs=[questioned_audio, suspect_audio],
        outputs=test_output
    )

    gr.Markdown(
        """
        ### How it works
        1. Upload or record the questioned audio sample.
        2. Upload or record the suspect audio sample.
        3. Click "Compare Speakers" to analyze the similarity between the two samples.
        4. View the results, including the similarity score and verdict.

        Note: For best results, use clear audio samples with minimal background noise.
        """
    )

demo.launch(share=True)