KIMOSSINO committed on
Commit
9f9087b
·
verified ·
1 Parent(s): eb0019e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -0
app.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
3
+ import torch
4
+ import scipy
5
+
6
# Hugging Face checkpoints used by this app: one for the text-to-speech model
# (and its processor), one for the HiFi-GAN vocoder.
_TTS_CHECKPOINT = "microsoft/speecht5_tts"
_VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"

# Loaded once at import time and shared by every request.
processor = SpeechT5Processor.from_pretrained(_TTS_CHECKPOINT)
model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)
vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_CHECKPOINT)

# Module-level random speaker embedding of shape (1, 512).
# NOTE(review): text_to_speech assigns its own local `speaker_embeddings`,
# so this value does not appear to be read anywhere in the visible code.
speaker_embeddings = torch.randn(1, 512)

# Display name -> language code for the UI dropdown.
LANGUAGES = dict(English="en", French="fr", Spanish="es")
19
+
20
def text_to_speech(text, language, speaker_type, speed):
    """Synthesize speech for ``text`` in Gradio's numpy audio format.

    Args:
        text: Text to synthesize. Empty or whitespace-only input is rejected
            up front and yields ``None``.
        language: Language name selected in the UI. It is accepted for
            interface compatibility but is not used by the synthesis below.
        speaker_type: ``"Female"`` scales the random speaker embedding by
            0.8; any other value scales it by 1.2.
        speed: Speed factor. ``1.0`` leaves the waveform untouched; other
            values resample it to ``1 / speed`` of its original length.

    Returns:
        A ``(sample_rate, waveform)`` tuple where ``waveform`` is a numpy
        array, or ``None`` when the input is empty or synthesis fails.
    """
    # Nothing to synthesize: avoid running the model on empty input.
    if not text or not text.strip():
        return None

    try:
        # NOTE(review): these are random vectors, not real speaker x-vectors,
        # so the voice differs on every call and the scale factor is only a
        # rough stand-in for a gender selection.
        scale = 0.8 if speaker_type == "Female" else 1.2
        speaker_embeddings = torch.randn(1, 512) * scale

        # Tokenize the input text.
        inputs = processor(text=text, return_tensors="pt")

        # Generate the waveform; the vocoder turns the spectrogram into audio.
        speech = model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings,
            vocoder=vocoder,
        )

        # Change speed by resampling the waveform. Linear interpolation
        # shortens/lengthens the signal, which also shifts its pitch.
        if speed != 1.0:
            speech = torch.nn.functional.interpolate(
                speech.unsqueeze(0).unsqueeze(0),  # (N,) -> (1, 1, N)
                scale_factor=1 / speed,
                mode="linear",
                align_corners=False,
            ).squeeze()

        # The SpeechT5 HiFi-GAN vocoder emits 16 kHz audio; labelling it as
        # 24 kHz would play it back too fast and too high. Read the rate from
        # the vocoder config and fall back to 16 kHz if it is not exposed.
        sample_rate = getattr(vocoder.config, "sampling_rate", 16000)

        # detach()/cpu() keep the conversion valid even if the tensor tracks
        # gradients or lives on an accelerator.
        return (sample_rate, speech.detach().cpu().numpy())
    except Exception as e:
        # UI callback boundary: log the failure and return no audio.
        print(f"Error in text_to_speech: {str(e)}")
        return None
54
+
55
+ # Create Gradio interface
56
def create_interface():
    """Build and return the Gradio Blocks UI for the text-to-speech demo.

    The layout is a two-column row: inputs (text, language, speaker gender,
    speed, and a submit button) on the left, the generated audio on the
    right. The button click is wired to ``text_to_speech``.
    """
    soft_theme = gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="gray",
    )
    language_names = list(LANGUAGES.keys())

    with gr.Blocks(theme=soft_theme) as ui:
        # Page header.
        gr.Markdown(
            """
            # 🎙️ Multilingual Text-to-Speech
            Convert text to natural-sounding speech in multiple languages.
            """
        )

        with gr.Row():
            # Left column: all user inputs.
            with gr.Column():
                text_box = gr.Textbox(
                    label="Enter Text",
                    placeholder="Type your text here...",
                    lines=5,
                )
                language_choice = gr.Dropdown(
                    choices=language_names,
                    value="English",
                    label="Language",
                )
                gender_radio = gr.Radio(
                    choices=["Male", "Female"],
                    value="Male",
                    label="Speaker Gender",
                )
                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speech Speed",
                )
                generate_button = gr.Button("Generate Speech", variant="primary")

            # Right column: the synthesized audio.
            with gr.Column():
                audio_player = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                )

        # Input order must match text_to_speech's positional parameters.
        generate_button.click(
            fn=text_to_speech,
            inputs=[text_box, language_choice, gender_radio, speed_slider],
            outputs=audio_player,
        )

        # Page footer.
        gr.Markdown(
            """
            ### Features:
            - Support for English, French, and Spanish
            - Male and Female voice options
            - Adjustable speech speed
            - High-quality, natural-sounding voices
            """
        )

    return ui
117
+
118
# Build the UI at import time so `demo` stays available as a module-level
# name, but only start the server when this file is run as a script, so that
# importing the module does not launch it as a side effect.
demo = create_interface()

if __name__ == "__main__":
    demo.launch()