import os
import pickle

import gradio as gr
import librosa
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import torch
from cryptography.fernet import Fernet
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from speechbrain.inference.speaker import EncoderClassifier
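
# Celebrity voice-twin demo: embed an input voice with a SpeechBrain x-vector
# model, compare it against precomputed (encrypted) VoxCeleb1 speaker centroids
# by cosine similarity, and plot the nearest speakers with PCA or t-SNE.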

# Configuration is read from the environment.
encrypted_file_path = os.environ.get("SPEAKER_EMBEDDINGS_FILE")
metadata_file = os.environ.get("METADATA_FILE")
visualization_method = os.environ.get("VISUALIZATION_METHOD", "pca")
max_length = 5 * 16000  # pad or truncate input audio to 5 seconds at 16 kHz
num_closest_speakers = 20  # how many nearest speakers to show in the plot
pca_dim = 50  # PCA dimensionality used before t-SNE

if not encrypted_file_path:
    raise ValueError("SPEAKER_EMBEDDINGS_FILE environment variable is not set.")
if not metadata_file:
    raise ValueError("METADATA_FILE environment variable is not set.")
if visualization_method not in ("pca", "tsne"):
    raise ValueError("Invalid VISUALIZATION_METHOD. Choose 'pca' or 'tsne'.")

# Report whether the secrets are present, but never print the key itself.
print(f"DECRYPTION_KEY set: {bool(os.getenv('DECRYPTION_KEY'))}")
print(f"SPEAKER_EMBEDDINGS_FILE: {encrypted_file_path}")
print(f"Encrypted file path exists: {os.path.exists(encrypted_file_path)}")

key = os.getenv("DECRYPTION_KEY")
if not key:
    raise ValueError(
        "Decryption key is missing. Ensure DECRYPTION_KEY is set in the environment variables."
    )

fernet = Fernet(key.encode("utf-8"))
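
# For reference, a compatible key and payload can be produced offline; a
# minimal sketch ("embeddings.enc" is a placeholder file name):
#
#     key = Fernet.generate_key()  # store this value as DECRYPTION_KEY
#     token = Fernet(key).encrypt(pickle.dumps(speaker_embeddings))
#     with open("embeddings.enc", "wb") as f:
#         f.write(token)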

# Sample clips bundled with the app for the Gradio examples section.
sample_audio_dir = "sample_audio"
sample_audio_files = [
    "Bob_Barker.mp3",
    "Howie_Mandel.m4a",
    "Katherine_Jenkins.mp3",
]

# Decrypt and unpickle the precomputed speaker embeddings.
try:
    with open(encrypted_file_path, "rb") as encrypted_file:
        encrypted_data = encrypted_file.read()
    decrypted_data_bytes = fernet.decrypt(encrypted_data)
    speaker_embeddings = pickle.loads(decrypted_data_bytes)
    print("Speaker embeddings loaded successfully!")
except FileNotFoundError:
    raise FileNotFoundError(
        f"Could not find encrypted embeddings file at: {encrypted_file_path}"
    )
except Exception as e:
    raise RuntimeError(f"Error during decryption or loading embeddings: {e}") from e

# Speaker metadata is tab-separated.
df = pd.read_csv(metadata_file, delimiter="\t")
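
# The metadata is assumed to follow the VoxCeleb1 vox1_meta.csv layout
# (tab-separated), e.g.:
#
#     VoxCeleb1 ID    VGGFace1 ID     Gender    Nationality    Set
#     id10001         A.J._Buckley    m         Ireland        dev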

# Ensure every stored embedding is a NumPy array.
for spk_id, embeddings in speaker_embeddings.items():
    speaker_embeddings[spk_id] = [np.array(embedding) for embedding in embeddings]

# Map VoxCeleb1 speaker IDs to human-readable names.
speaker_id_to_name = dict(zip(df["VoxCeleb1 ID"], df["VGGFace1 ID"]))

# Pretrained x-vector speaker encoder from SpeechBrain.
classifier = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    savedir="pretrained_models/spkrec-xvect-voxceleb",
)


def calculate_average_embedding(embeddings):
    """Average a speaker's embeddings and L2-normalize the result."""
    avg_embedding = np.mean(embeddings, axis=0)
    return normalize(avg_embedding.reshape(1, -1)).flatten()


# One L2-normalized centroid per known speaker.
speaker_centroids = {
    spk_id: calculate_average_embedding(embeddings)
    for spk_id, embeddings in speaker_embeddings.items()
}
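
# Because centroids are unit-norm, cosine similarity against them reduces to a
# dot product; an illustrative sanity check:
#
#     c = next(iter(speaker_centroids.values()))
#     assert abs(np.linalg.norm(c) - 1.0) < 1e-6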


def prepare_data_for_visualization(speaker_centroids, closest_speaker_ids):
    """Collect centroids and IDs of the closest speakers, in dictionary order."""
    closest = set(closest_speaker_ids)
    all_embeddings = []
    all_speaker_ids = []
    for speaker_id, centroid in speaker_centroids.items():
        if speaker_id in closest:
            all_embeddings.append(centroid)
            all_speaker_ids.append(speaker_id)
    return np.array(all_embeddings), np.array(all_speaker_ids)


def reduce_dimensionality(all_embeddings, method="tsne", perplexity=5, pca_dim=50):
    """Project embeddings to 2-D with PCA, or with PCA followed by t-SNE."""
    if method == "pca":
        reducer = PCA(n_components=2)
    elif method == "tsne":
        # Pre-reduce with PCA; n_components may not exceed the number of samples.
        pca_reducer = PCA(n_components=min(pca_dim, *all_embeddings.shape))
        all_embeddings = pca_reducer.fit_transform(all_embeddings)
        # t-SNE additionally requires perplexity < n_samples.
        reducer = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    else:
        raise ValueError("Invalid method. Choose 'pca' or 'tsne'.")
    reduced_embeddings = reducer.fit_transform(all_embeddings)
    return reducer, reduced_embeddings
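
# Usage sketch (hypothetical data: 20 speakers with 512-dim centroids):
#
#     X = np.random.randn(20, 512)
#     _, pts = reduce_dimensionality(X, method="pca")
#     pts.shape  # (20, 2)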


def get_speaker_name(speaker_id):
    """Resolve a VoxCeleb1 ID to a display name, falling back to the raw ID."""
    return speaker_id_to_name.get(speaker_id, f"Unknown ({speaker_id})")


def generate_visualization(
    reducer,
    all_embeddings,
    reduced_embeddings,
    all_speaker_ids,
    new_embedding,
    predicted_speaker_id,
    visualization_method,
    perplexity,
    pca_dim,
):
    """Plot the closest known speakers together with the new voice in 2-D."""
    if visualization_method == "pca":
        # PCA projects a new point directly with the already-fitted reducer.
        new_embedding_reduced = reducer.transform(new_embedding.reshape(1, -1))
    elif visualization_method == "tsne":
        # t-SNE has no transform() for unseen points, so refit on the original
        # high-dimensional embeddings plus the new voice. (Stacking the raw
        # embedding onto the 2-D output would be a shape mismatch.)
        combined = np.vstack([all_embeddings, new_embedding.reshape(1, -1)])
        combined = PCA(n_components=min(pca_dim, *combined.shape)).fit_transform(
            combined
        )
        tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
        combined_reduced = tsne.fit_transform(combined)
        reduced_embeddings = combined_reduced[:-1]
        new_embedding_reduced = combined_reduced[-1].reshape(1, -1)
    else:
        raise ValueError("Invalid visualization method.")

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=reduced_embeddings[:, 0],
            y=reduced_embeddings[:, 1],
            mode="markers",
            marker=dict(color="blue", size=8, opacity=0.5),
            text=[get_speaker_name(speaker_id) for speaker_id in all_speaker_ids],
            name="Other Speakers",
        )
    )

    if predicted_speaker_id in all_speaker_ids:
        predicted_speaker_index = list(all_speaker_ids).index(predicted_speaker_id)
        fig.add_trace(
            go.Scatter(
                x=[reduced_embeddings[predicted_speaker_index, 0]],
                y=[reduced_embeddings[predicted_speaker_index, 1]],
                mode="markers",
                marker=dict(
                    color="green",
                    size=10,
                    symbol="circle",
                    line=dict(color="black", width=2),
                ),
                name=get_speaker_name(predicted_speaker_id),
                text=[get_speaker_name(predicted_speaker_id)],
            )
        )

    fig.add_trace(
        go.Scatter(
            x=new_embedding_reduced[:, 0],
            y=new_embedding_reduced[:, 1],
            mode="markers",
            marker=dict(color="red", size=12, symbol="star"),
            name="New Voice",
            text=["New Voice"],
        )
    )

    fig.update_layout(
        title=f"Dimensionality Reduction of Speaker Embeddings using {visualization_method.upper()}",
        xaxis_title="Component 1",
        yaxis_title="Component 2",
        legend=dict(x=0, y=1, traceorder="normal", orientation="h"),
        hovermode="closest",
    )
    return fig
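
# Note: because t-SNE cannot place new points into an existing map, the t-SNE
# branch above refits on every call, so the layout may shift between inputs;
# the PCA branch projects into a fixed 2-D space instead.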


def identify_voice_and_visualize_with_averaging(audio_file, perplexity=5):
    """Embed the input voice, find the most similar speaker, and visualize it."""
    try:
        if isinstance(audio_file, str):
            signal, fs = librosa.load(audio_file, sr=16000)
        elif isinstance(audio_file, np.ndarray):
            signal = audio_file
            fs = 16000
        else:
            raise ValueError(
                "Invalid audio input. Must be a file path or a NumPy array."
            )

        # Truncate, then zero-pad, so every clip is exactly max_length samples.
        signal_tensor = torch.tensor(signal, dtype=torch.float32).unsqueeze(0)
        signal_tensor = signal_tensor[:, :max_length]
        signal_tensor = torch.nn.functional.pad(
            signal_tensor, (0, max_length - signal_tensor.shape[1])
        )

        # x-vector for the input voice, L2-normalized like the stored centroids.
        user_embedding = classifier.encode_batch(signal_tensor).detach().cpu().numpy()
        user_embedding = normalize(
            user_embedding.squeeze(axis=(0, 1)).reshape(1, -1)
        ).flatten()

        # Cosine similarity between the new voice and every speaker centroid.
        similarity_scores = {
            spk_id: cosine_similarity(
                user_embedding.reshape(1, -1), centroid.reshape(1, -1)
            )[0][0]
            for spk_id, centroid in speaker_centroids.items()
        }

        closest_speaker_ids = sorted(
            similarity_scores, key=similarity_scores.get, reverse=True
        )[:num_closest_speakers]
        predicted_speaker_id = closest_speaker_ids[0]
        highest_similarity = similarity_scores[predicted_speaker_id]

        all_embeddings, all_speaker_ids = prepare_data_for_visualization(
            speaker_centroids, closest_speaker_ids
        )
        reducer, reduced_embeddings = reduce_dimensionality(
            all_embeddings,
            method=visualization_method,
            perplexity=perplexity,
            pca_dim=pca_dim,
        )

        predicted_speaker_name = get_speaker_name(predicted_speaker_id)
        similarity_percentage = highest_similarity * 100

        visualization = generate_visualization(
            reducer,
            all_embeddings,
            reduced_embeddings,
            all_speaker_ids,
            user_embedding,
            predicted_speaker_id,
            visualization_method,
            perplexity,
            pca_dim,
        )

        result_text = (
            f"The voice resembles speaker: {predicted_speaker_name} "
            f"with a similarity of {similarity_percentage:.2f}%"
        )
        return result_text, visualization

    except Exception as e:
        return f"Error during processing: {e}", None


cache_dir = "examples_cache"
os.makedirs(cache_dir, exist_ok=True)

iface = gr.Interface(
    fn=identify_voice_and_visualize_with_averaging,
    inputs=gr.Audio(type="filepath", label="Input Audio"),
    outputs=["text", gr.Plot()],
    title="Discover Your Celebrity Voice Twin!",
    description=(
        "Record your voice or upload an audio file, and see your celebrity "
        "match! Not ready to record? Try our sample voices to see how it works!"
    ),
    cache_examples=False,
    examples_per_page=3,
    examples=[[os.path.join(sample_audio_dir, f)] for f in sample_audio_files],
)

iface.launch(debug=True, share=True)