KIMOSSINO committed on
Commit
b8b580d
·
verified ·
1 Parent(s): 16e8067

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -84
app.py CHANGED
@@ -1,118 +1,87 @@
1
  import gradio as gr
2
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
3
  import torch
4
- import scipy
 
 
5
 
# Pretrained SpeechT5 pieces, loaded once at import time and shared by every request.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Module-level default speaker embedding: a random (1, 512) tensor.
speaker_embeddings = torch.randn(1, 512)

# Display name shown in the UI -> language code.
LANGUAGES = dict(English="en", French="fr", Spanish="es")
 
 
 
 
 
 
 
 
19
 
def text_to_speech(text, language, speaker_type, speed):
    """Synthesize ``text`` with SpeechT5 and return it for a Gradio audio output.

    Args:
        text: Text to speak.
        language: Selected language label. Accepted for the UI wiring but not
            read anywhere in this function.
        speaker_type: ``"Female"`` scales the random embedding by 0.8; any
            other value scales it by 1.2.
        speed: Speed factor. ``1.0`` leaves the waveform untouched; other
            values resample it by a factor of ``1 / speed``.

    Returns:
        ``(sample_rate, samples)`` with ``samples`` as a NumPy array, or
        ``None`` when any step raised (the error is printed).
    """
    # The microsoft/speecht5_hifigan vocoder is documented as producing 16 kHz
    # audio. Tagging the samples as 24 kHz makes playback ~1.5x too fast and
    # raises the pitch, so report the rate the vocoder actually uses.
    sample_rate = 16000
    try:
        # NOTE(review): this is an unseeded random vector, so the voice differs
        # on every call; SpeechT5 examples use real 512-dim x-vectors — confirm
        # whether a fixed embedding should be loaded instead.
        scale = 0.8 if speaker_type == "Female" else 1.2
        speaker_embeddings = torch.randn(1, 512) * scale

        inputs = processor(text=text, return_tensors="pt")
        speech = model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings,
            vocoder=vocoder,
        )

        if speed != 1.0:
            # Linear interpolation needs a 3-D input, hence the two unsqueeze
            # calls (assumes `speech` is a 1-D waveform — TODO confirm).
            speech = torch.nn.functional.interpolate(
                speech.unsqueeze(0).unsqueeze(0),
                scale_factor=1 / speed,
                mode="linear",
                align_corners=False,
            ).squeeze()

        return (sample_rate, speech.numpy())
    except Exception as e:
        # Best-effort: report the failure and let the audio output stay empty.
        print(f"Error in text_to_speech: {str(e)}")
        return None
54
 
# Gradio UI construction
def create_interface():
    """Build the Gradio Blocks UI and return it.

    The left column collects the text, language, speaker gender and speed;
    the right column shows the generated audio. The button click calls
    ``text_to_speech`` with the four inputs.
    """
    theme = gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="gray",
    )
    language_names = list(LANGUAGES)

    with gr.Blocks(theme=theme) as demo:
        gr.Markdown(
            """
            # 🎙️ Multilingual Text-to-Speech
            Convert text to natural-sounding speech in multiple languages.
            """
        )

        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="Enter Text",
                    placeholder="Type your text here...",
                    lines=5,
                )
                language = gr.Dropdown(
                    choices=language_names,
                    value="English",
                    label="Language",
                )
                speaker = gr.Radio(
                    choices=["Male", "Female"],
                    value="Male",
                    label="Speaker Gender",
                )
                speed = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speech Speed",
                )
                submit_btn = gr.Button("Generate Speech", variant="primary")

            with gr.Column():
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                )

        # Inputs are passed positionally in the order text_to_speech expects.
        click_inputs = [text_input, language, speaker, speed]
        submit_btn.click(
            fn=text_to_speech,
            inputs=click_inputs,
            outputs=audio_output,
        )

        gr.Markdown(
            """
            ### Features:
            - Support for English, French, and Spanish
            - Male and Female voice options
            - Adjustable speech speed
            - High-quality, natural-sounding voices
            """
        )

    return demo
117
 
118
  # Build the Blocks UI once at import time and expose it as module-level `demo`.
  # NOTE(review): no launch call is visible in this view — confirm it exists below.
  demo = create_interface()
 
1
  import gradio as gr
2
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
3
  import torch
4
+ import librosa
5
+ import numpy as np
6
+ from scipy.io.wavfile import write
7
 
# Load the models and the processor (once, at import time).
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Default values for the audio settings: UI display name -> language code.
LANGUAGES = {
    "English": "en",
    "French": "fr",
    "Spanish": "es",
}
15
 
def generate_speaker_embedding(speaker_type):
    """Return a ``(1, 512)`` speaker-embedding tensor for ``speaker_type``.

    Args:
        speaker_type: ``"Female"`` selects the female voice; any other value
            is treated as the male voice.

    Returns:
        A float tensor of shape ``(1, 512)``: a normal random vector scaled
        by 0.8 for the female voice and by 1.2 otherwise.
    """
    is_female = speaker_type == "Female"
    # A private, seeded generator makes each voice reproducible: the same
    # selection yields the same embedding on every request instead of a new
    # random voice per call, and the global torch RNG is left untouched.
    rng = torch.Generator().manual_seed(1 if is_female else 0)
    base_embedding = torch.randn(1, 512, generator=rng)
    # NOTE(review): this is still synthetic noise, not a real x-vector;
    # confirm whether a recorded speaker embedding should be loaded instead.
    return base_embedding * (0.8 if is_female else 1.2)
23
+
def adjust_speed(audio, speed):
    """Change the playback speed of ``audio`` using librosa's time stretch.

    Args:
        audio: Waveform samples to stretch.
        speed: Stretch rate handed to librosa. ``1.0`` means no change.

    Returns:
        ``audio`` itself when ``speed`` equals 1.0, otherwise the array
        returned by ``librosa.effects.time_stretch``.
    """
    if speed == 1.0:
        return audio
    # `rate` is keyword-only from librosa 0.10 onward, so a positional call
    # raises TypeError there; naming it works on older releases as well.
    return librosa.effects.time_stretch(audio, rate=speed)
29
 
def text_to_speech(text, language, speaker_type, speed):
    """Synthesize ``text`` with SpeechT5 and return the path of a WAV file.

    Args:
        text: Text to speak. Must contain non-whitespace characters.
        language: Selected language label. Accepted for the UI wiring but not
            read anywhere in this function.
        speaker_type: Passed to ``generate_speaker_embedding``.
        speed: Passed to ``adjust_speed``; ``1.0`` leaves the audio untouched.

    Returns:
        Path of a newly created float32 WAV file holding the speech.

    Raises:
        gr.Error: If ``text`` is empty or any synthesis step fails. Raising
            lets Gradio display the message; returning the message string
            would hand it to the ``filepath`` audio output as a file path.
    """
    import tempfile

    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # The microsoft/speecht5_hifigan vocoder is documented as producing 16 kHz
    # audio; a 24 kHz header makes playback ~1.5x too fast and pitched up.
    sample_rate = 16000

    try:
        # Build the speaker embedding.
        speaker_embeddings = generate_speaker_embedding(speaker_type)

        # Tokenize the text.
        inputs = processor(text=text, return_tensors="pt")

        # Generate the waveform.
        generated_speech = model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings,
            vocoder=vocoder,
        ).cpu().numpy()

        # Apply the requested speed.
        adjusted_speech = adjust_speed(generated_speech, speed)

        # Write to a unique file per request so concurrent users cannot
        # overwrite each other's audio (a shared "output.wav" could).
        # The file is left on disk for the caller to serve.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
            output_file = handle.name
        write(output_file, sample_rate, adjusted_speech.astype(np.float32))

        return output_file
    except Exception as e:
        raise gr.Error(f"Error: {str(e)}") from e
 
55
 
# Build the Gradio interface
def create_interface():
    """Assemble the Gradio Blocks UI and return it.

    The left column holds the text box, language dropdown, speaker-gender
    radio, speed slider and the submit button; the right column holds the
    audio output. Clicking the button calls ``text_to_speech``.
    """
    soft_theme = gr.themes.Soft(primary_hue="blue")
    language_names = list(LANGUAGES)

    with gr.Blocks(theme=soft_theme) as demo:
        gr.Markdown("# 🎙️ Multilingual Text-to-Speech")

        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="Enter Text",
                    placeholder="Type your text here...",
                    lines=5,
                )
                language = gr.Dropdown(
                    choices=language_names,
                    value="English",
                    label="Language",
                )
                speaker = gr.Radio(
                    choices=["Male", "Female"],
                    value="Male",
                    label="Speaker Gender",
                )
                speed = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speech Speed",
                )
                submit_btn = gr.Button("Generate Speech", variant="primary")

            with gr.Column():
                audio_output = gr.Audio(label="Generated Speech", type="filepath")

        # Inputs are passed positionally in the order text_to_speech expects.
        click_inputs = [text_input, language, speaker, speed]
        submit_btn.click(
            fn=text_to_speech,
            inputs=click_inputs,
            outputs=audio_output,
        )

        gr.Markdown("""
        ### Features:
        - Multilingual support: English, French, and Spanish.
        - Male and Female voice options.
        - Adjustable speech speed.
        - High-quality, natural-sounding voices.
        """)

    return demo
86
 
87
  # Build the Blocks UI once at import time and expose it as module-level `demo`.
  # NOTE(review): no launch call is visible in this view — confirm it exists below.
  demo = create_interface()