Spaces:
Build error
Build error
Create main.py
Browse files
main.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
|
3 |
+
from datasets import load_dataset
|
4 |
+
import torch
|
5 |
+
import soundfile as sf
|
6 |
+
import os
|
7 |
+
|
8 |
+
# Function to generate speech using the pipeline method
|
9 |
+
def generate_speech_pipeline(text, speaker_embedding):
    """Synthesise ``text`` through the transformers text-to-speech pipeline.

    Args:
        text: The text to convert to speech.
        speaker_embedding: Speaker embedding forwarded to the model as the
            ``speaker_embeddings`` forward parameter.

    Returns:
        A ``(audio, sampling_rate)`` tuple taken from the ``"audio"`` and
        ``"sampling_rate"`` entries of the pipeline output.
    """
    tts = pipeline("text-to-speech", "microsoft/speecht5_tts")
    forward_params = {"speaker_embeddings": speaker_embedding}
    result = tts(text, forward_params=forward_params)
    waveform = result["audio"]
    rate = result["sampling_rate"]
    return waveform, rate
|
13 |
+
|
14 |
+
# Function to generate speech using the processor + generate method
|
15 |
+
def generate_speech_processor(text, speaker_embedding):
    """Synthesise ``text`` by driving the SpeechT5 processor, model and vocoder.

    Args:
        text: The text to convert to speech.
        speaker_embedding: Speaker embedding passed to
            ``model.generate_speech`` alongside the token ids.

    Returns:
        A ``(audio, sampling_rate)`` tuple: the generated speech converted
        with ``.numpy()``, and the fixed rate ``16000`` that the caller uses
        when writing the audio file.
    """
    text_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    hifigan = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    encoded = text_processor(text=text, return_tensors="pt")
    token_ids = encoded["input_ids"]
    waveform = tts_model.generate_speech(
        token_ids,
        speaker_embedding,
        vocoder=hifigan,
    )
    sample_rate = 16000
    return waveform.numpy(), sample_rate
|
23 |
+
|
24 |
+
def main():
    """Render the Streamlit text-to-speech page.

    Shows a text area and a generation-method selector. When the
    "Generate Speech" button is pressed, the text is synthesised with the
    chosen method, written to ``speech.wav`` and embedded in an audio player.
    """
    st.title("Text-to-Speech with SpeechT5")

    st.write("Enter the text you want to convert to speech:")

    text = st.text_area("Text", "Hello, my dog is cooler than you!")

    # Choose the method to generate speech.
    # This widget must be created outside the button branch: st.button is
    # True only during the single rerun triggered by the click, so a selector
    # created inside that branch would only appear after generation started,
    # and changing it would rerun the script with the button False again,
    # meaning the non-default method could never be used.
    method = st.selectbox(
        "Choose the method for generating speech",
        ["Pipeline", "Processor + Generate"],
    )

    if not st.button("Generate Speech"):
        return

    # Nothing to synthesise for empty / whitespace-only input.
    if not text.strip():
        st.warning("Please enter some text to convert to speech.")
        return

    st.write("Generating speech...")

    # Load xvector containing speaker's voice characteristics from a dataset
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    # unsqueeze(0) adds a leading dimension to the selected xvector.
    speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    if method == "Pipeline":
        audio, samplerate = generate_speech_pipeline(text, speaker_embedding)
    else:
        audio, samplerate = generate_speech_processor(text, speaker_embedding)

    # Save and play the generated speech
    output_path = "speech.wav"
    sf.write(output_path, audio, samplerate=samplerate)
    st.audio(output_path)
|
50 |
+
|
51 |
+
if __name__ == "__main__":
    # Build the page only when this file is executed as a script,
    # not when it is imported as a module.
    main()
|