Spaces:

fffiloni
/

instant-TTS-Bark-cloning

Paused

App Files Files Community

instant-TTS-Bark-cloning / app.py

fffiloni

Update app.py

9d546f2 over 1 year ago

raw

history blame

4.61 kB

	import gradio as gr
	import os
	import shutil

	#from huggingface_hub import snapshot_download
	import numpy as np
	from scipy.io import wavfile
	"""
	model_ids = [
	'suno/bark',
	]

	for model_id in model_ids:
	model_name = model_id.split('/')[-1]
	snapshot_download(model_id, local_dir=f'checkpoints/{model_name}')

	from TTS.tts.configs.bark_config import BarkConfig
	from TTS.tts.models.bark import Bark

	#os.environ['CUDA_VISIBLE_DEVICES'] = '1'
	config = BarkConfig()
	model = Bark.init_from_config(config)
	model.load_checkpoint(config, checkpoint_dir="checkpoints/bark", eval=True)
	"""
	from TTS.api import TTS
	import torch

	# Load the Coqui TTS Bark model once, at start-up, so every request reuses it.
	# Only ask for the GPU when CUDA is actually available; hard-coding gpu=True
	# makes start-up fail on CPU-only hardware.
	tts = TTS(
	    "tts_models/multilingual/multi-dataset/bark",
	    gpu=torch.cuda.is_available(),
	)

	def infer(prompt, input_wav_file):
	    """Clone the uploaded voice and speak ``prompt`` with it.

	    The uploaded WAV is moved to ``bark_voices/<name>/<name>.wav``, where
	    ``<name>`` is the upload's base name without its extension. That name is
	    then passed to ``tts.tts_to_file`` as the speaker, with ``bark_voices/``
	    as the voice directory, and the speech is written to ``output.wav``.

	    Args:
	        prompt: Text to synthesize.
	        input_wav_file: Filesystem path of the uploaded WAV file.

	    Returns:
	        A 3-tuple: the path ``"output.wav"``, the result of
	        ``gr.make_waveform`` for that file, and a ``gr.update`` for the
	        ``.npz`` file component (made visible only when a ``.npz`` file is
	        found in the speaker directory).

	    Raises:
	        gr.Error: If no WAV file was supplied.
	    """
	    if not input_wav_file:
	        # Without this guard os.path.basename(None) fails with a TypeError
	        # that tells the user nothing.
	        raise gr.Error("Please upload a WAV file of the voice you want to clone.")

	    print("SAVING THE AUDIO FILE TO WHERE IT BELONGS")

	    # Each voice lives in its own directory named after the uploaded file.
	    file_name = os.path.splitext(os.path.basename(input_wav_file))[0]
	    voice_dir = os.path.join("bark_voices", file_name)
	    os.makedirs(voice_dir, exist_ok=True)
	    shutil.move(input_wav_file, os.path.join(voice_dir, f"{file_name}.wav"))

	    tts.tts_to_file(
	        text=prompt,
	        file_path="output.wav",
	        voice_dir="bark_voices/",
	        speaker=file_name,
	    )

	    # Log what ended up in the speaker directory.
	    contents = sorted(os.listdir(voice_dir))
	    for item in contents:
	        print(item)

	    # os.listdir returns entries in arbitrary order, so pick the .npz file by
	    # extension instead of by position.
	    # NOTE(review): presumably the .npz is written by the TTS call above;
	    # confirm against the Coqui TTS Bark documentation.
	    npz_names = [item for item in contents if item.endswith(".npz")]
	    if npz_names:
	        npz_update = gr.update(
	            value=f"bark_voices/{file_name}/{npz_names[0]}",
	            visible=True,
	        )
	    else:
	        # Nothing to offer for download: clear and hide the component.
	        npz_update = gr.update(value=None, visible=False)

	    tts_video = gr.make_waveform(audio="output.wav")

	    return "output.wav", tts_video, npz_update


	# Custom stylesheet passed to gr.Blocks(css=...):
	#   - caps the width of the element with id "col-container" and centres it;
	#   - centres any <img> whose src contains "#center" (the fragment used by
	#     the "Duplicate this Space" badge URL in the Markdown header).
	css = """
	#col-container {max-width: 780px; margin-left: auto; margin-right: auto;}
	img[src*='#center'] {
	display: block;
	margin: auto;
	}
	"""

	# Gradio UI: a single centred column holding the intro text, then a row with
	# the inputs (prompt, voice sample, submit button) on the left and the
	# outputs (cloned audio, waveform video, .npz file) on the right.
	with gr.Blocks(css=css) as demo:
	    with gr.Column(elem_id="col-container"):

	        gr.Markdown("""
	            <h1 style="text-align: center;">Instant Voice Cloning</h1>
	            <p style="text-align: center;">
	            Clone any voice in less than 2 minutes with this <a href="https://tts.readthedocs.io/en/dev/models/bark.html" target="_blank">Coqui TTS + Bark</a> demo ! <br />
	            Upload a clean 20 seconds WAV file of the voice you want to clone, <br />
	            type your text-to-speech prompt and hit submit ! <br />
	            </p>

	            [![Duplicate this Space](https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm.svg#center)](https://huggingface.co/spaces/fffiloni/instant-TTS-Bark-cloning?duplicate=true)

	            """)
	        with gr.Row():
	            with gr.Column():
	                prompt = gr.Textbox(
	                    label="Text to speech prompt"
	                )

	                # type="filepath" means infer() receives a path on disk.
	                audio_in = gr.Audio(
	                    label="WAV voice to clone",
	                    type="filepath",
	                    source="upload"
	                )

	                submit_btn = gr.Button("Submit")

	            with gr.Column():

	                cloned_out = gr.Audio(
	                    label="Text to speech output"
	                )

	                video_out = gr.Video(
	                    label="Waveform video"
	                )

	                # Starts hidden; infer() returns a gr.update for it.
	                npz_file = gr.File(
	                    label=".npz file",
	                    visible=False
	                )

	        # The outputs list order must match the tuple returned by infer().
	        submit_btn.click(
	            fn=infer,
	            inputs=[
	                prompt,
	                audio_in
	            ],
	            outputs=[
	                cloned_out,
	                video_out,
	                npz_file
	            ]
	        )

	# Enable the request queue (at most 20 waiting requests) and start the app.
	demo.queue(max_size=20).launch()