capradeepgujaran committed on
Commit
46acd9e
·
verified ·
1 Parent(s): 82519df

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -46
app.py CHANGED
@@ -1,58 +1,107 @@
1
  import gradio as gr
2
- from gtts import gTTS
 
 
 
 
 
3
  import io
4
  import tempfile
5
- from pydub import AudioSegment
6
- import numpy as np
7
 
8
def _shift_playback_rate(segment, rate):
    """Return *segment* replayed *rate* times faster; pitch moves with tempo."""
    # Re-label the raw samples with a scaled frame rate, then resample back to
    # the original rate so the result keeps the source's sample rate.
    relabelled = segment._spawn(
        segment.raw_data,
        overrides={"frame_rate": int(segment.frame_rate * rate)},
    )
    return relabelled.set_frame_rate(segment.frame_rate)


def text_to_speech_with_emotion(text, emotion, language='en'):
    """Synthesize *text* with gTTS and colour the audio with *emotion*.

    Args:
        text: Text to speak.
        emotion: "Happy", "Sad" or "Angry" alter the audio; any other value
            (e.g. "Neutral") leaves the base speech unchanged.
        language: Language code passed to gTTS as ``lang``.

    Returns:
        The processed ``AudioSegment``.
    """
    # Generate base speech
    tts = gTTS(text=text, lang=language, slow=False)

    with io.BytesIO() as fp:
        tts.write_to_fp(fp)
        fp.seek(0)
        audio = AudioSegment.from_mp3(fp)

    # Adjust audio based on emotion. pydub has no ``pitch_shift`` method and
    # its ``speedup`` effect cannot slow audio down, so Happy and Sad use a
    # playback-rate change instead: tempo and pitch rise or fall together.
    if emotion == "Happy":
        audio = _shift_playback_rate(audio, 1.15)
    elif emotion == "Sad":
        audio = _shift_playback_rate(audio, 0.85)
    elif emotion == "Angry":
        audio = audio.speedup(playback_speed=1.1)
        audio = audio + 3  # Increase volume slightly
    # Neutral emotion remains unchanged

    # Apply some subtle enhancements
    audio = audio.compress_dynamic_range(threshold=-15, ratio=2.0, attack=5, release=50)
    audio = audio.high_pass_filter(80)  # Remove very low frequencies

    return audio
34
 
35
def generate_emotional_speech(text, emotion, language):
    """Render emotional speech to a temporary MP3 and return the file path.

    The speech comes from ``text_to_speech_with_emotion`` and is normalized
    before being written out at 192 kbps.
    """
    normalized = text_to_speech_with_emotion(text, emotion, language).normalize()

    # delete=False keeps the file on disk after the handle is closed so the
    # returned path is still valid for the caller.
    handle = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    try:
        normalized.export(handle.name, format="mp3", bitrate="192k")  # Higher bitrate for better quality
    finally:
        handle.close()
    return handle.name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  # Gradio interface
46
# Single-function UI: text, emotion and language in; generated audio out.
iface = gr.Interface(
    fn=generate_emotional_speech,
    inputs=[
        gr.Textbox(label="Enter text for speech"),
        gr.Radio(
            choices=["Neutral", "Happy", "Sad", "Angry"],
            label="Emotion",
            value="Neutral",
        ),
        gr.Dropdown(
            choices=["en", "es", "fr", "de", "it"],
            label="Language",
            value="en",
        ),
    ],
    outputs=gr.Audio(label="Generated Emotional Speech"),
    title="Clean Emotional Text-to-Speech Generator",
    description="Generate clean speech with emotional variations without background sounds.",
)

iface.launch()
 
1
  import gradio as gr
2
+ from transformers import AutoProcessor, MusicgenForConditionalGeneration
3
+ import torch
4
+ from TTS.api import TTS
5
+ import scipy
6
+ import numpy as np
7
+ from pydub import AudioSegment
8
  import io
9
  import tempfile
 
 
10
 
11
# Pick the compute device once, outside any try block, so the name is always
# bound for the MusicGen setup below and for generate_sound().
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize TTS model. A failure is printed and leaves `tts` as None so the
# app can still start and report the problem through the UI.
try:
    tts = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(device)
except Exception as e:
    print(f"Error initializing TTS model: {e}")
    tts = None

# Initialize Musicgen model for sound generation (same best-effort policy:
# on any failure both `processor` and `model` are set to None).
try:
    processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
    model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
    model.to(device)
except Exception as e:
    print(f"Error initializing Musicgen model: {e}")
    processor = None
    model = None
28
+
29
def apply_emotion(audio, emotion):
    """Colour a mono waveform with an emotion and return it as 16-bit PCM.

    The emotion is applied as a playback-rate change done by linear
    resampling, so tempo and pitch move together: "Happy" plays 1.1x faster
    (higher pitch), "Sad" 0.9x (lower pitch) and "Angry" 1.05x. This replaces
    calls to ``AudioSegment.pitch_shift`` (which pydub does not provide) and
    to ``speedup`` with a factor below 1 (which pydub cannot perform).

    Args:
        audio: 1-D sequence or array of samples. Floating-point input is
            treated as a waveform in [-1.0, 1.0]; integer input is treated as
            16-bit PCM.
        emotion: "Happy", "Sad" or "Angry". Any other value (e.g. "Neutral")
            leaves tempo and pitch untouched.

    Returns:
        ``numpy.ndarray`` of dtype ``int16`` holding the processed samples.
    """
    playback_rates = {"Happy": 1.1, "Sad": 0.9, "Angry": 1.05}

    samples = np.asarray(audio).reshape(-1)
    if np.issubdtype(samples.dtype, np.integer):
        # Bring PCM input onto the same -1..1 scale used for float input.
        wave = samples.astype(np.float64) / 32767.0
    else:
        wave = samples.astype(np.float64)

    rate = playback_rates.get(emotion, 1.0)
    if rate != 1.0 and wave.size > 1:
        # Reading the signal on a coarser (rate > 1) or finer (rate < 1) grid
        # shortens or lengthens it, which is what changes tempo and pitch.
        new_length = max(1, int(round(wave.size / rate)))
        positions = np.linspace(0, wave.size - 1, num=new_length)
        wave = np.interp(positions, np.arange(wave.size), wave)

    # Clip before scaling so out-of-range samples saturate instead of wrapping.
    return np.round(np.clip(wave, -1.0, 1.0) * 32767.0).astype(np.int16)
 
 
 
 
 
 
 
38
 
39
def generate_speech(text, emotion):
    """Synthesize *text* with the TTS model and style it with *emotion*.

    Args:
        text: Text to speak.
        emotion: Emotion label from the UI; forwarded unchanged to
            ``apply_emotion``.

    Returns:
        An ``(audio, message)`` pair for the two Gradio outputs. ``audio`` is
        a ``gr.Audio`` pointing at a temporary WAV file on success and
        ``None`` on failure; ``message`` is a status string for the user.
    """
    # Reject blank input up front so the user gets a clear message instead of
    # whatever error the model raises on an empty sentence.
    if not text or not text.strip():
        return (None, "Please enter some text for speech generation.")

    if tts is None:
        return (None, "TTS model not available. Check logs for initialization error.")

    try:
        speech = tts.tts(text=text)
        speech_with_emotion = apply_emotion(speech, emotion)

        # Improve audio quality
        audio_segment = AudioSegment(
            speech_with_emotion.tobytes(),
            frame_rate=22050,
            sample_width=2,
            channels=1,
        )
        audio_segment = audio_segment.compress_dynamic_range()
        audio_segment = audio_segment.normalize()

        # Only reserve a unique path here; the handle is closed before pydub
        # writes to it, because a still-open temp file cannot be reopened for
        # writing on Windows.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
            wav_path = fp.name
        audio_segment.export(wav_path, format="wav")
        return (gr.Audio(value=wav_path), "Speech generated successfully")
    except Exception as e:
        # UI boundary: surface any failure as a message rather than a crash.
        return (None, f"Error in speech generation: {str(e)}")
57
+
58
def generate_sound(text):
    """Generate audio from a text description with the MusicGen model.

    Args:
        text: Description of the sound to generate.

    Returns:
        An ``(audio, message)`` pair for the two Gradio outputs. ``audio`` is
        a ``gr.Audio`` pointing at a temporary WAV file on success and
        ``None`` on failure; ``message`` is a status string for the user.
    """
    # Reject blank prompts up front with a clear message.
    if not text or not text.strip():
        return (None, "Please enter a text description for sound generation.")

    if processor is None or model is None:
        return (None, "Musicgen model not available. Check logs for initialization error.")

    try:
        inputs = processor(
            text=[text],
            padding=True,
            return_tensors="pt",
        ).to(device)
        audio_values = model.generate(**inputs, max_new_tokens=512)  # Increased tokens for longer audio
        audio_data = audio_values[0, 0].cpu().numpy()

        # The model output is a floating-point waveform, but AudioSegment with
        # sample_width=2 reads the buffer as 16-bit integers. Convert
        # explicitly; passing the float bytes through produces noise.
        pcm = np.round(np.clip(audio_data, -1.0, 1.0) * 32767.0).astype(np.int16)

        # Improve audio quality
        audio_segment = AudioSegment(
            pcm.tobytes(),
            frame_rate=model.config.audio_encoder.sampling_rate,
            sample_width=2,
            channels=1,
        )
        audio_segment = audio_segment.compress_dynamic_range()
        audio_segment = audio_segment.normalize()

        # Only reserve a unique path here; the handle is closed before pydub
        # writes to it, because a still-open temp file cannot be reopened for
        # writing on Windows.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
            wav_path = fp.name
        audio_segment.export(wav_path, format="wav")
        return (gr.Audio(value=wav_path), "Sound generated successfully")
    except Exception as e:
        # UI boundary: surface any failure as a message rather than a crash.
        return (None, f"Error in sound generation: {str(e)}")
86
 
87
  # Gradio interface
88
# Two-tab UI: one tab per generator, each with its own status message box.
with gr.Blocks() as iface:
    gr.Markdown("# Enhanced Text-to-Speech and Text-to-Sound Generation Tool")

    # Tab 1: speech synthesis with a selectable emotion.
    with gr.Tab("Text-to-Speech"):
        text_input = gr.Textbox(label="Enter text for speech generation")
        emotion_input = gr.Dropdown(
            choices=["Neutral", "Happy", "Sad", "Angry"],
            label="Select Emotion",
            value="Neutral",
        )
        speech_button = gr.Button("Generate Speech")
        speech_output = gr.Audio(label="Generated Speech")
        speech_message = gr.Textbox(label="Message")

    # Tab 2: sound generation from a free-text description.
    with gr.Tab("Text-to-Sound"):
        sound_input = gr.Textbox(label="Enter text description for sound generation")
        sound_button = gr.Button("Generate Sound")
        sound_output = gr.Audio(label="Generated Sound")
        sound_message = gr.Textbox(label="Message")

    # Each handler returns (audio, message), matching its two outputs.
    speech_button.click(
        fn=generate_speech,
        inputs=[text_input, emotion_input],
        outputs=[speech_output, speech_message],
    )
    sound_button.click(
        fn=generate_sound,
        inputs=[sound_input],
        outputs=[sound_output, sound_message],
    )

iface.launch()