|
|
|
|
|
|
|
import gradio as gr |
|
import torch |
|
import torchaudio |
|
import tempfile |
|
from vocos import Vocos |
|
from pydub import AudioSegment, silence |
|
from model import CFM, UNetT |
|
from cached_path import cached_path |
|
from model.utils import ( |
|
load_checkpoint, |
|
get_tokenizer, |
|
save_spectrogram, |
|
) |
|
from transformers import pipeline |
|
import soundfile as sf |
|
|
|
device = "cuda" if torch.cuda.is_available() else "cpu" |
|
print(f"Using {device} device") |
|
|
|
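# Whisper ASR pipeline, used to transcribe the reference audio automatically when no reference text is provided.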
pipe = pipeline( |
|
"automatic-speech-recognition", |
|
model="openai/whisper-large-v3", |
|
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
|
device=device, |
|
) |
|
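# Vocos vocoder: converts the generated 24 kHz mel spectrograms back into audio waveforms.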
vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz") |
|
|
|
|
|
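# Mel-spectrogram settings (these should match what the checkpoint was trained with) and
# inference defaults: ODE steps (nfe_step), classifier-free guidance strength, sway sampling, and speed.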
target_sample_rate = 24000 |
|
n_mel_channels = 100 |
|
hop_length = 256 |
|
target_rms = 0.1 |
|
nfe_step = 32 |
|
cfg_strength = 2.0 |
|
ode_method = "euler" |
|
sway_sampling_coef = -1.0 |
|
speed = 1.0 |
|
|
|
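# E2-TTS Base: a conditional flow-matching (CFM) model with a UNetT transformer backbone,
# restored from the EMA weights of the released checkpoint.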
def load_model(): |
|
model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4) |
|
ckpt_path = str(cached_path("hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.safetensors")) |
|
vocab_char_map, vocab_size = get_tokenizer("Emilia_ZH_EN", "pinyin") |
|
model = CFM( |
|
transformer=UNetT( |
|
**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels |
|
), |
|
mel_spec_kwargs=dict( |
|
target_sample_rate=target_sample_rate, |
|
n_mel_channels=n_mel_channels, |
|
hop_length=hop_length, |
|
), |
|
odeint_kwargs=dict( |
|
method=ode_method, |
|
), |
|
vocab_char_map=vocab_char_map, |
|
).to(device) |
|
model = load_checkpoint(model, ckpt_path, device, use_ema=True) |
|
return model |
|
|
|
model = load_model() |
|
|
|
|
|
|
|
def infer(ref_audio, ref_text, gen_text, remove_silence, progress=gr.Progress()): |
|
progress(0, desc="Processing audio...") |
|
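    # Preprocess the reference audio: strip long silences, then clip to at most 15 s of conditioning audio.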
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f: |
|
aseg = AudioSegment.from_file(ref_audio) |
|
non_silent_segs = silence.split_on_silence( |
|
aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000 |
|
) |
|
non_silent_wave = AudioSegment.silent(duration=0) |
|
for non_silent_seg in non_silent_segs: |
|
non_silent_wave += non_silent_seg |
|
aseg = non_silent_wave |
|
audio_duration = len(aseg) |
|
if audio_duration > 15000: |
|
gr.Warning("Audio is over 15s, clipping to only first 15s.") |
|
aseg = aseg[:15000] |
|
aseg.export(f.name, format="wav") |
|
ref_audio = f.name |
|
|
|
    progress(0.2, desc="Transcribing audio...")
|
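    # If no reference text was supplied, fall back to Whisper to transcribe the reference clip.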
if not ref_text.strip(): |
|
ref_text = pipe( |
|
ref_audio, |
|
chunk_length_s=30, |
|
batch_size=128, |
|
generate_kwargs={"task": "transcribe"}, |
|
return_timestamps=False, |
|
)["text"].strip() |
|
|
|
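    # Ensure the reference text ends with ". " so it joins cleanly with the text to generate.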
if not ref_text.endswith(". "): |
|
ref_text += ". " if not ref_text.endswith(".") else " " |
|
|
|
    progress(0.4, desc="Generating audio...")
|
audio, sr = torchaudio.load(ref_audio) |
|
if audio.shape[0] > 1: |
|
audio = torch.mean(audio, dim=0, keepdim=True) |
|
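    # Boost quiet reference audio up to target_rms; the inverse gain is applied to the output below.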
rms = torch.sqrt(torch.mean(torch.square(audio))) |
|
if rms < target_rms: |
|
audio = audio * target_rms / rms |
|
if sr != target_sample_rate: |
|
resampler = torchaudio.transforms.Resample(sr, target_sample_rate) |
|
audio = resampler(audio) |
|
audio = audio.to(device) |
|
|
|
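    # Estimate total output length in mel frames: reference frames plus a guess for the generated
    # part, scaled by the gen/ref character ratio and divided by the speed factor.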
text_list = [ref_text + gen_text] |
|
duration = audio.shape[-1] // hop_length + int(audio.shape[-1] / hop_length / len(ref_text) * len(gen_text) / speed) |
|
|
|
    progress(0.6, desc="Synthesizing speech...")
|
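    # Sample a mel spectrogram for ref_text + gen_text with the flow-matching sampler
    # (nfe_step ODE steps, classifier-free guidance, sway sampling), conditioned on the reference audio.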
with torch.inference_mode(): |
|
generated, _ = model.sample( |
|
cond=audio, |
|
text=text_list, |
|
duration=duration, |
|
steps=nfe_step, |
|
cfg_strength=cfg_strength, |
|
sway_sampling_coef=sway_sampling_coef, |
|
) |
|
|
|
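    # The sampler returns frames for the full ref+gen text; drop the reference portion
    # and decode the remainder to a waveform with Vocos.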
generated = generated.to(torch.float32) |
|
generated = generated[:, audio.shape[-1] // hop_length:, :] |
|
generated_mel_spec = generated.permute(0, 2, 1) |
|
generated_wave = vocos.decode(generated_mel_spec.cpu()) |
|
if rms < target_rms: |
|
generated_wave = generated_wave * rms / target_rms |
|
|
|
generated_wave = generated_wave.squeeze().cpu().numpy() |
|
|
|
    progress(0.8, desc="Post-processing...")
|
if remove_silence: |
|
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f: |
|
sf.write(f.name, generated_wave, target_sample_rate) |
|
aseg = AudioSegment.from_file(f.name) |
|
non_silent_segs = silence.split_on_silence(aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=500) |
|
non_silent_wave = AudioSegment.silent(duration=0) |
|
for non_silent_seg in non_silent_segs: |
|
non_silent_wave += non_silent_seg |
|
aseg = non_silent_wave |
|
aseg.export(f.name, format="wav") |
|
generated_wave, _ = torchaudio.load(f.name) |
|
generated_wave = generated_wave.squeeze().cpu().numpy() |
|
|
|
    progress(0.9, desc="Generating spectrogram...")
|
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram: |
|
spectrogram_path = tmp_spectrogram.name |
|
save_spectrogram(generated_mel_spec[0].cpu().numpy(), spectrogram_path) |
|
|
|
    progress(1.0, desc="Done!")
|
return (target_sample_rate, generated_wave), spectrogram_path |
|
|
|
custom_css = """ |
|
|
|
/* Dark theme customization */ |
|
:root { |
|
--background-fill-primary: #1a1a1a !important; |
|
--background-fill-secondary: #2d2d2d !important; |
|
--border-color-primary: #404040 !important; |
|
--text-color: #ffffff !important; |
|
--body-text-color: #ffffff !important; |
|
--color-accent-soft: #3d4c7d !important; |
|
} |
|
|
|
body { |
|
background-color: #1a1a1a !important; |
|
color: #ffffff !important; |
|
} |
|
|
|
.gradio-container { |
|
background-color: #1a1a1a !important; |
|
} |
|
|
|
.tabs { |
|
background-color: #2d2d2d !important; |
|
} |
|
|
|
.tab-selected { |
|
background-color: #404040 !important; |
|
} |
|
|
|
#logo-column { |
|
display: flex; |
|
justify-content: flex-end; |
|
align-items: flex-start; |
|
background-color: transparent !important; |
|
} |
|
|
|
#logo-column img { |
|
max-width: 180px; |
|
height: auto; |
|
margin-top: 10px; |
|
filter: brightness(0.9); |
|
} |
|
|
|
.gr-box { |
|
background-color: #2d2d2d !important; |
|
border: 1px solid #404040 !important; |
|
} |
|
|
|
.gr-button { |
|
background-color: #3d4c7d !important; |
|
color: white !important; |
|
} |
|
|
|
.gr-button:hover { |
|
background-color: #4a5d99 !important; |
|
} |
|
|
|
/* Modified input styling for darker background */ |
|
.gr-input, .gr-textarea { |
|
background-color: #1a1a1a !important; |
|
color: white !important; |
|
border: 1px solid #404040 !important; |
|
} |
|
|
|
#step-2-input textarea { |
|
background-color: #ffffff !important; |
|
color: #000000 !important; |
|
border-color: #404040 !important; |
|
} |
|
|
|
#step-2-input textarea:focus { |
|
border-color: #3d4c7d !important; |
|
box-shadow: 0 0 0 2px rgba(61, 76, 125, 0.2) !important; |
|
} |
|
|
|
#reference-text-input textarea { |
|
    background-color: #ffffff !important;

    color: #000000 !important;
|
border-color: #404040 !important; |
|
} |
|
|
|
#reference-text-input textarea:focus { |
|
border-color: #3d4c7d !important; |
|
box-shadow: 0 0 0 2px rgba(61, 76, 125, 0.2) !important; |
|
} |
|
|
|
.gr-accordion { |
|
background-color: #2d2d2d !important; |
|
} |
|
|
|
.gr-form { |
|
background-color: transparent !important; |
|
} |
|
|
|
.markdown-text { |
|
color: #ffffff !important; |
|
} |
|
|
|
.markdown-text h1, .markdown-text h2, .markdown-text h3 { |
|
color: #ffffff !important; |
|
} |
|
|
|
.audio-player { |
|
background-color: #2d2d2d !important; |
|
border: 1px solid #404040 !important; |
|
} |
|
|
|
""" |
|
|
|
custom_theme = gr.themes.Soft( |
|
primary_hue="indigo", |
|
secondary_hue="slate", |
|
neutral_hue="slate", |
|
font=gr.themes.GoogleFont("Inter"), |
|
).set( |
|
body_background_fill="#1a1a1a", |
|
body_background_fill_dark="#1a1a1a", |
|
body_text_color="#ffffff", |
|
body_text_color_dark="#ffffff", |
|
background_fill_primary="#2d2d2d", |
|
background_fill_primary_dark="#2d2d2d", |
|
background_fill_secondary="#1a1a1a", |
|
background_fill_secondary_dark="#1a1a1a", |
|
border_color_primary="#404040", |
|
border_color_primary_dark="#404040", |
|
button_primary_background_fill="#3d4c7d", |
|
button_primary_background_fill_dark="#3d4c7d", |
|
button_primary_text_color="#ffffff", |
|
button_primary_text_color_dark="#ffffff", |
|
) |
|
|
|
|
|
with gr.Blocks(theme=custom_theme, css=custom_css) as app: |
|
|
|
with gr.Row(): |
|
with gr.Column(scale=9): |
|
gr.Markdown( |
|
""" |
|
# Antriksh AI |
|
|
|
Welcome to our voice cloning application! Follow these steps to create your own custom voice: |
|
|
|
1. Upload a short audio clip (less than 15 seconds) of the voice you want to clone. |
|
2. Enter the text you want to generate in the new voice. |
|
3. Click "Synthesize" and listen to hear the magic! |
|
|
|
It's that easy! Let's get started. |
|
""" |
|
) |
|
|
|
with gr.Column(scale=1, elem_id="logo-column"): |
|
gr.Image("logo/logo.jpg", label="", show_label=False) |
|
|
|
with gr.Row(): |
|
with gr.Column(scale=1): |
|
ref_audio_input = gr.Audio( |
|
label="Step 1: Upload Reference Audio", |
|
type="filepath", |
|
elem_classes="audio-player" |
|
) |
|
gen_text_input = gr.Textbox( |
|
label="Step 2: Enter Text to Generate", |
|
lines=5, |
|
elem_id="step-2-input", |
|
elem_classes="gr-textarea" |
|
) |
|
generate_btn = gr.Button( |
|
"Step 3: Synthesize", |
|
variant="primary", |
|
elem_classes="gr-button" |
|
) |
|
|
|
with gr.Column(scale=1): |
|
audio_output = gr.Audio( |
|
label="Generated Audio", |
|
elem_classes="audio-player" |
|
) |
|
spectrogram_output = gr.Image(label="Spectrogram") |
|
|
|
with gr.TabItem("Advanced Settings"): |
|
gr.Markdown( |
|
"These settings are optional. If you're not sure, leave them as they are." |
|
) |
|
ref_text_input = gr.Textbox( |
|
label="Reference Text (Optional)", |
|
info="Leave blank for automatic transcription.", |
|
lines=2, |
|
elem_id="reference-text-input", |
|
elem_classes="gr-textarea" |
|
) |
|
remove_silence = gr.Checkbox( |
|
label="Remove Silences", |
|
info="This can improve the quality of longer audio clips.", |
|
value=True |
|
) |
|
|
|
generate_btn.click( |
|
infer, |
|
inputs=[ |
|
ref_audio_input, |
|
ref_text_input, |
|
gen_text_input, |
|
remove_silence, |
|
], |
|
outputs=[audio_output, spectrogram_output], |
|
) |
|
|
|
with gr.TabItem("How It Works"): |
|
gr.Markdown( |
|
""" |
|
# How Voice Cloning Works |
|
|
|
Our voice cloning system uses advanced AI technology to create a synthetic voice that sounds like the reference audio you provide. Here's a simplified explanation of the process: |
|
|
|
1. **Audio Analysis**: When you upload a reference audio clip, our system analyzes its unique characteristics, including pitch, tone, and speech patterns. |
|
|
|
2. **Text Processing**: The text you want to generate is processed and converted into a format that our AI model can understand. |
|
|
|
3. **Voice Synthesis**: Our AI model, based on the E2-TTS (Embarrassingly Easy Text-to-Speech) architecture, combines the characteristics of the reference audio with the new text to generate a synthetic voice. |
|
|
|
4. **Audio Generation**: The synthetic voice is converted into an audio waveform, which you can then play back or download. |
|
|
|
5. **Spectrogram Creation**: A visual representation of the audio (spectrogram) is generated, showing the frequency content of the sound over time. |
|
|
|
This process allows you to generate new speech in the voice of the reference audio, even saying things that weren't in the original recording. It's a powerful tool for creating custom voiceovers, audiobooks, or just for fun! |
|
|
|
Remember, the quality of the output depends on the quality and length of the input audio. For best results, use a clear, high-quality audio clip of 10-15 seconds in length. |
|
""" |
|
) |
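    # For reference, a minimal programmatic sketch of the same pipeline outside the UI
    # (file names are illustrative; assumes a local "ref.wav" clip, and note that the
    # progress callbacks inside infer() expect a Gradio event context):
    #
    #   (sr, wave), spec_path = infer("ref.wav", "", "Hello from the cloned voice.", remove_silence=True)
    #   sf.write("output.wav", wave, sr)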
|
|
|
|
if __name__ == "__main__": |
|
app.launch(share=True) |