Spaces:
Running
on
Zero
Running
on
Zero
File size: 7,406 Bytes
d72812e 1b0b842 5a6cca7 648b3ba d72812e 1b0b842 d72812e e0f2f07 9f73839 d72812e 0adc8b9 d72812e 0249b26 d72812e 35053bd 1b0b842 799d009 1b0b842 35053bd 1b0b842 d72812e 175f658 34f6b5d 175f658 d72812e d2a76ad 1b0b842 35053bd 1b0b842 6372099 1b0b842 d72812e 35053bd d72812e 35053bd d72812e 1b0b842 e501dea 6c485e8 e501dea 35053bd e501dea 42d17d1 175f658 42d17d1 d72812e 35053bd d72812e 35053bd d72812e fa19b71 96b57d0 d72812e a848bf6 24b3333 35053bd d72812e 96b57d0 6372099 96b57d0 d72812e 1b0b842 d72812e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
import concurrent
import os
import tempfile
from typing import Optional, Tuple
import numpy as np
import spaces
from transformers import pipeline
import gradio as gr
import torch
import torchaudio
from resemble_enhance.enhancer.inference import denoise, enhance
from flore200_codes import flores_codes
from tts import BambaraTTS
# Select the compute device: the GPU when PyTorch reports CUDA, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Translation pipeline (the target language is chosen per call).
# Alternative checkpoint: "oza75/nllb-1.3B-mt-french-bambara"
translation_model = "oza75/nllb-600M-mt-french-bambara"
translator = pipeline(
    "translation",
    model=translation_model,
    max_length=512,
)

# Text-to-speech model wrapper.
tts_model = "oza75/bambara-tts"
tts = BambaraTTS(tts_model)
def translate_to_bambara(text, src_lang):
    """Translate *text* into Bambara with the module-level ``translator``.

    Args:
        text: The text to translate.
        src_lang: Source-language code forwarded to the pipeline as
            ``src_lang``.

    Returns:
        The ``translation_text`` field of the first pipeline result,
        converted to ``str``.
    """
    results = translator(text, src_lang=src_lang, tgt_lang="bam_Latn")
    first_result = results[0]
    return str(first_result['translation_text'])
def text_to_speech(bambara_text, reference_speaker: str, reference_audio: Optional[Tuple] = None):
    """Synthesize speech for *bambara_text* with the module-level ``tts`` model.

    Args:
        bambara_text: Text to synthesize.
        reference_speaker: Path to a speaker reference WAV file; used only
            when ``reference_audio`` is ``None``.
        reference_audio: Optional ``(sample_rate, samples)`` pair where
            ``samples`` is a NumPy array. When given, it is written to a
            temporary WAV file that serves as the speaker reference instead
            of ``reference_speaker``.

    Returns:
        ``(audio, sr)``: the synthesized audio averaged over its first
        dimension, and the sample rate reported by the TTS model.
    """
    if reference_audio is None:
        # No custom voice supplied: fall back to the bundled speaker file.
        sr, audio = tts.text_to_speech(bambara_text, speaker_reference_wav_path=reference_speaker)
    else:
        ref_sr, ref_audio = reference_audio
        ref_audio = torch.from_numpy(ref_audio)

        # Add a leading dimension when the audio is 1-D, since the saver
        # expects a 2-D tensor.
        # NOTE(review): a 2-D input is passed through unchanged; confirm the
        # caller's array layout matches what torchaudio.save expects.
        if ref_audio.ndim == 1:
            ref_audio = ref_audio.unsqueeze(0)

        # Reserve a unique path and close the handle right away, so the file
        # can be re-opened by name on platforms that lock open files.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
            tmp_path = tmp.name

        try:
            torchaudio.save(tmp_path, ref_audio, ref_sr)
            # Use the temporary file as the speaker reference.
            sr, audio = tts.text_to_speech(
                bambara_text,
                speaker_reference_wav_path=tmp_path,
                enable_text_splitting=True,
            )
        finally:
            # Always remove the temporary file, even when saving or
            # synthesis raises, so failed requests do not leak files.
            os.unlink(tmp_path)

    # Average over the first dimension (presumably channels -- TODO confirm).
    audio = audio.mean(dim=0)
    return audio, sr
def enhance_speech(audio_array, sampling_rate, solver, nfe, tau, denoise_before_enhancement):
    """Run denoising and enhancement on the same audio concurrently.

    Each step is best-effort: if it raises, the error is printed and the
    unmodified input audio and sample rate are used for that output.

    Args:
        audio_array: Audio tensor to process (must support ``.cpu()``).
        sampling_rate: Sample rate of ``audio_array``.
        solver: Solver name; lower-cased before being passed to ``enhance``.
        nfe: Number of function evaluations; coerced to ``int``.
        tau: Passed to ``enhance`` as ``tau``.
        denoise_before_enhancement: Selects ``lambd`` for ``enhance``:
            ``0.9`` when true, ``0.1`` otherwise.

    Returns:
        ``((denoise_sr, denoised), (enhance_sr, enhanced))`` where both audio
        items are NumPy arrays.
    """
    # A bare ``import concurrent`` does not load the ``futures`` submodule,
    # so import the executor explicitly instead of relying on another
    # library having imported it first.
    from concurrent.futures import ThreadPoolExecutor

    solver = solver.lower()
    nfe = int(nfe)
    lambd = 0.9 if denoise_before_enhancement else 0.1

    def denoise_audio():
        try:
            return denoise(audio_array, sampling_rate, device)
        except Exception as e:
            # Deliberate best-effort: fall back to the unprocessed audio.
            print("> Error while denoising : ", str(e))
            return audio_array, sampling_rate

    def enhance_audio():
        try:
            return enhance(audio_array, sampling_rate, device, nfe=nfe, solver=solver, lambd=lambd, tau=tau)
        except Exception as e:
            # Deliberate best-effort: fall back to the unprocessed audio.
            print("> Error while enhancement : ", str(e))
            return audio_array, sampling_rate

    with ThreadPoolExecutor() as executor:
        future_denoise = executor.submit(denoise_audio)
        future_enhance = executor.submit(enhance_audio)
        denoised_audio, new_sr1 = future_denoise.result()
        enhanced_audio, new_sr2 = future_enhance.result()

    # Move to host memory and convert to NumPy for the caller.
    return (new_sr1, denoised_audio.cpu().numpy()), (new_sr2, enhanced_audio.cpu().numpy())
@spaces.GPU
def _fn(
        src_lang,
        text,
        reference_speaker,
        reference_audio=None,
        solver="Midpoint",
        nfe=128,
        prior_temp=0.01,
        denoise_before_enhancement=False
):
    """Gradio handler: translate, synthesize, then enhance, yielding progress.

    This is a generator. Each yield is a 4-tuple
    ``(translated_text, tts_audio, denoised_audio, enhanced_audio)`` in which
    stages that have not run yet are ``None``.

    Args:
        src_lang: Key into ``flores_codes`` naming the source language.
        text: Text to translate.
        reference_speaker: File name of a speaker reference inside
            ``./audios``.
        reference_audio: Optional ``(sample_rate, samples)`` pair used as the
            voice reference instead of ``reference_speaker``.
        solver: Solver name forwarded to ``enhance_speech``.
        nfe: Number of function evaluations forwarded to ``enhance_speech``.
        prior_temp: Forwarded to ``enhance_speech`` as ``tau``.
        denoise_before_enhancement: Forwarded to ``enhance_speech``.
    """
    flores_code = flores_codes[src_lang]
    speaker_wav = os.path.join("./audios", reference_speaker)

    # Stage 1: translation. Publish the text as soon as it is ready.
    translated = translate_to_bambara(text, flores_code)
    yield translated, None, None, None

    # Stage 2: speech synthesis. ``text_to_speech`` already treats
    # ``reference_audio=None`` as "use the speaker file", so one call covers
    # both cases.
    waveform, sample_rate = text_to_speech(translated, speaker_wav, reference_audio)
    yield translated, (sample_rate, waveform.numpy()), None, None

    # Stage 3: denoise and enhance the synthesized audio.
    denoised, enhanced = enhance_speech(
        waveform,
        sample_rate,
        solver,
        nfe,
        prior_temp,
        denoise_before_enhancement,
    )
    yield translated, (sample_rate, waveform.numpy()), denoised, enhanced
def main():
    """Build the Gradio interface around ``_fn`` and launch it.

    The language dropdown is filled from the keys of ``flores_codes`` and the
    voice dropdown from the regular files found in ``./audios``. The app is
    launched with ``share=False``.
    """
    lang_codes = list(flores_codes.keys())

    # List all files in the ./audios directory for the dropdown.
    # os.listdir returns entries in arbitrary order, so sort them to keep the
    # dropdown stable across runs and machines.
    audio_dir = './audios'
    audio_files = sorted(
        f for f in os.listdir(audio_dir)
        if os.path.isfile(os.path.join(audio_dir, f))
    )

    # Build Gradio app
    app = gr.Interface(
        fn=_fn,
        inputs=[
            gr.Dropdown(label="Source Language", choices=lang_codes, value='French'),
            gr.Textbox(label="Text to Translate", lines=3, value="Thomas Isidore Noël Sankara est fils d'un père Peul — originaire du village de Sitoèga dans le département de Bokin dans la province du Passoré — et d'une mère mossi, et grandit entre valeurs militaires et religiosité chrétienne."),
            gr.Dropdown(label="Voice", choices=audio_files, value="male_3.wav"),
            gr.Audio(label="Clone your voice (optional)", type="numpy", format="wav", elem_id="clone_voice_input"),
            # The enhancement controls below are currently not exposed, so
            # ``_fn`` runs with its default values for them.
            # gr.Dropdown(
            #     choices=["Midpoint", "RK4", "Euler"], value="Midpoint",
            #     label="ODE Solver (Midpoint is recommended)"
            # ),
            # gr.Slider(minimum=1, maximum=128, value=64, step=1, label="Number of Function Evaluations"),
            # gr.Slider(minimum=0.1, maximum=1, value=0.5, step=0.01, label="Prior Temperature"),
            # gr.Checkbox(value=False, label="Denoise Before Enhancement")
        ],
        outputs=[
            gr.Textbox(label="Translated Text"),
            gr.Audio(label="Original TTS Audio", format='wav'),
            gr.Audio(label="Denoised Audio", format='wav'),
            gr.Audio(label="Enhanced Audio", format='wav')
        ],
        examples=[
            ['French', "Mais il n'avait pas l'air content. Je lui ai même dit : « Ce n'est pas de ma faute. » Il n'a pas répondu. J'ai pensé alors que je n'aurais pas dû lui dire cela. En somme, je n'avais pas à m'excuser. C'était plutôt à lui de me présenter ses condoléances.", "male_3.wav", None],
            ['French', "l'asile de vieillards est à Marengo, à quatre-vingts kilomètres d'Alger. Je prendrai l'autobus à deux heures et j'arriverai dans l'après-midi. Ainsi, je pourrai veiller et je rentrerai demain soir. J'ai demandé deux jours de congé à mon patron et il ne pouvait pas me les refuser avec une excuse pareille", "male_3.wav", None],
            ['English', "Today, my mother is dead. Or maybe yesterday, I don't know. I received a telegram from the asylum: “Mother deceased. Hand burial. Distinguished feelings.“ It does not mean anything. Maybe it was yesterday.", "male_2.wav", None],
        ],
        css="#clone_voice_input .audio-container button.boundedheight { height: 147px !important; }",
        title="Bambara Translation and Text to Speech with Audio Enhancement",
        description="Translate text to Bambara and convert it to speech with options to enhance audio quality."
    )

    app.launch(share=False)


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    main()
|