# Hugging Face Spaces demo app: SpeechT5 text-to-speech with Gradio.
import gradio as gr
from transformers import pipeline
from datasets import load_dataset
import soundfile as sf
import torch
import os
import io
import base64
import numpy as np
from pydub import AudioSegment
# NOTE(review): this is set AFTER `transformers` was imported above, so the
# cache location is likely already fixed — move before the import to take effect.
os.environ['TRANSFORMERS_CACHE'] = '.cache'
print ("----- setting up pipeline -----")
# SpeechT5 text-to-speech pipeline; downloads model weights on first run.
synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts")
print ("----- setting up dataset -----")
# CMU ARCTIC x-vector speaker embeddings; index 7306 selects one fixed voice.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# Shape (1, 512) tensor expected by SpeechT5's `speaker_embeddings` argument.
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
# You can replace this embedding with your own as well.
print ("----- synthetizing audio -----")
#speech = synthesiser("Hello, my dog is cooler than you!", forward_params={"speaker_embeddings": speaker_embedding})
#sf.write("speech.wav", speech["audio"], samplerate=speech["sampling_rate"])
def greet(name):
    """Return a "Hello <name>!!" greeting (leftover smoke-test handler)."""
    return "".join(["Hello ", name, "!!"])
def synthesise_audio(text, forward_params=None):
    """Synthesise `text` to speech and return the path of the written WAV file.

    Args:
        text: Input text; at most 100 characters.
        forward_params: Optional extra generation kwargs, merged with the
            fixed speaker embedding (previously this argument was silently
            ignored).

    Returns:
        Path to the generated WAV file ("speech.wav").

    Raises:
        ValueError: If `text` exceeds 100 characters.
    """
    if len(text) > 100:
        raise ValueError("Error: El texto es demasiado largo. Por favor, limita tu entrada a 100 caracteres.")
    # Always pass the fixed speaker voice; caller-supplied params may add to it.
    params = {"speaker_embeddings": speaker_embedding}
    if forward_params:
        params.update(forward_params)
    speech = synthesiser(text, forward_params=params)
    sf.write("speech.wav", speech["audio"], samplerate=speech["sampling_rate"])
    return "speech.wav"
# ----- Gradio UI wiring -----
input_text = gr.Textbox(lines=10, label="Type the text you want to convert to speech:")

demo = gr.Interface(
    fn=synthesise_audio,
    inputs=input_text,
    outputs="audio",
    description="----- manuai Text To Speech generator test -----",
    # Gradio expects "never"/"auto"/"manual" here; the boolean form is deprecated.
    allow_flagging="never",
)
demo.launch(debug=True)