capradeepgujaran committed on
Commit
427214b
·
verified ·
1 Parent(s): ab2d014

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -77
app.py CHANGED
@@ -1,107 +1,94 @@
1
  import gradio as gr
2
- from transformers import AutoProcessor, MusicgenForConditionalGeneration
3
  import torch
4
- from TTS.api import TTS
5
- import scipy
6
  import numpy as np
7
- from pydub import AudioSegment
8
- import io
9
  import tempfile
 
10
 
11
- # Initialize TTS model
12
  try:
13
- device = "cuda" if torch.cuda.is_available() else "cpu"
14
- tts = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(device)
15
  except Exception as e:
16
- print(f"Error initializing TTS model: {e}")
17
- tts = None
18
 
19
- # Initialize Musicgen model for sound generation
20
- try:
21
- processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
22
- model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
23
- model.to(device)
24
- except Exception as e:
25
- print(f"Error initializing Musicgen model: {e}")
26
- processor = None
27
- model = None
28
 
29
- def apply_emotion(audio, emotion):
30
- audio_segment = AudioSegment(audio.tobytes(), frame_rate=22050, sample_width=2, channels=1)
 
 
 
31
  if emotion == "Happy":
32
- audio_segment = audio_segment.pitch_shift(1).speedup(playback_speed=1.1)
 
33
  elif emotion == "Sad":
34
- audio_segment = audio_segment.pitch_shift(-1).speedup(playback_speed=0.9)
 
35
  elif emotion == "Angry":
36
- audio_segment = audio_segment.pitch_shift(0.5).speedup(playback_speed=1.05)
37
- return np.array(audio_segment.get_array_of_samples())
38
-
39
- def generate_speech(text, emotion):
40
- try:
41
- if tts is not None:
42
- speech = tts.tts(text=text)
43
- speech_with_emotion = apply_emotion(speech, emotion)
44
-
45
- # Improve audio quality
46
- audio_segment = AudioSegment(speech_with_emotion.tobytes(), frame_rate=22050, sample_width=2, channels=1)
47
- audio_segment = audio_segment.compress_dynamic_range()
48
- audio_segment = audio_segment.normalize()
49
-
50
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
51
- audio_segment.export(fp.name, format="wav")
52
- return (gr.Audio(value=fp.name), "Speech generated successfully")
53
- else:
54
- return (None, "TTS model not available. Check logs for initialization error.")
55
- except Exception as e:
56
- return (None, f"Error in speech generation: {str(e)}")
57
 
58
- def generate_sound(text):
59
- try:
60
- if processor is not None and model is not None:
61
- inputs = processor(
62
- text=[text],
63
- padding=True,
64
- return_tensors="pt",
65
- ).to(device)
66
- audio_values = model.generate(**inputs, max_new_tokens=512) # Increased tokens for longer audio
67
- audio_data = audio_values[0, 0].cpu().numpy()
68
-
69
- # Improve audio quality
70
- audio_segment = AudioSegment(
71
- audio_data.tobytes(),
72
- frame_rate=model.config.audio_encoder.sampling_rate,
73
- sample_width=2,
74
- channels=1
75
- )
76
- audio_segment = audio_segment.compress_dynamic_range()
77
- audio_segment = audio_segment.normalize()
78
-
79
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
80
- audio_segment.export(fp.name, format="wav")
81
- return (gr.Audio(value=fp.name), "Sound generated successfully")
82
- else:
83
- return (None, "Musicgen model not available. Check logs for initialization error.")
84
- except Exception as e:
85
- return (None, f"Error in sound generation: {str(e)}")
86
 
87
  # Gradio interface
88
  with gr.Blocks() as iface:
89
- gr.Markdown("# Enhanced Text-to-Speech and Text-to-Sound Generation Tool")
90
 
91
  with gr.Tab("Text-to-Speech"):
92
  text_input = gr.Textbox(label="Enter text for speech generation")
93
  emotion_input = gr.Dropdown(["Neutral", "Happy", "Sad", "Angry"], label="Select Emotion", value="Neutral")
 
94
  speech_button = gr.Button("Generate Speech")
95
  speech_output = gr.Audio(label="Generated Speech")
96
  speech_message = gr.Textbox(label="Message")
97
 
98
- with gr.Tab("Text-to-Sound"):
99
- sound_input = gr.Textbox(label="Enter text description for sound generation")
100
  sound_button = gr.Button("Generate Sound")
101
  sound_output = gr.Audio(label="Generated Sound")
102
  sound_message = gr.Textbox(label="Message")
103
 
104
- speech_button.click(generate_speech, inputs=[text_input, emotion_input], outputs=[speech_output, speech_message])
105
- sound_button.click(generate_sound, inputs=[sound_input], outputs=[sound_output, sound_message])
 
 
 
 
106
 
107
  iface.launch()
 
1
  import gradio as gr
2
+ import pyttsx3
3
  import torch
4
+ import torchaudio
5
+ from torch import nn
6
  import numpy as np
 
 
7
  import tempfile
8
+ import os
9
 
10
# Initialize TTS engine.
# `engine` stays None when pyttsx3 cannot start, so handlers can detect the
# failure and report it instead of crashing.
engine = None
try:
    engine = pyttsx3.init()
except Exception as e:
    print(f"Error initializing TTS engine: {e}")
16
 
17
class SimpleWaveformGenerator(nn.Module):
    """Generate a pure sine tone at a configurable frequency.

    The frequency is stored as an ``nn.Parameter`` and is exposed as
    ``generator.frequency``, so callers may read or overwrite it after
    construction.

    Args:
        frequency: Tone frequency in cycles per unit of ``t`` (Hz when ``t``
            is expressed in seconds). Defaults to 440.0.
    """

    def __init__(self, frequency=440.0):
        super().__init__()
        # float() lets callers pass ints without creating an integer tensor,
        # which nn.Parameter could not track gradients for.
        self.frequency = nn.Parameter(torch.tensor(float(frequency)))

    def forward(self, t):
        """Return ``sin(2 * pi * frequency * t)`` evaluated element-wise."""
        return torch.sin(2 * np.pi * self.frequency * t)
 
 
24
 
25
# (rate, pitch) engine property values applied for each selectable emotion.
_EMOTION_VOICE_SETTINGS = {
    "Happy": (175, 75),
    "Sad": (125, 25),
    "Angry": (150, 100),
    "Neutral": (150, 50),
}


def text_to_speech_with_emotion(text, emotion, lang='en'):
    """Synthesize ``text`` to a WAV file using emotion-specific voice settings.

    Args:
        text: The text to speak. Empty or whitespace-only text is rejected.
        emotion: One of "Happy", "Sad", "Angry" or "Neutral"; any other value
            is treated as "Neutral".
        lang: Accepted for interface compatibility; not used by this function.

    Returns:
        A ``(path, message)`` tuple. ``path`` is the generated WAV file, or
        ``None`` when nothing was generated, in which case ``message``
        explains why.
    """
    if not text or not text.strip():
        return None, "Please enter some text to convert to speech."
    if engine is None:
        return None, "TTS engine not initialized correctly."

    # Set voice properties based on emotion.
    # NOTE(review): 'pitch' may not be supported by every pyttsx3 driver;
    # confirm it has an effect on the deployment platform.
    rate, pitch = _EMOTION_VOICE_SETTINGS.get(
        emotion, _EMOTION_VOICE_SETTINGS["Neutral"]
    )
    engine.setProperty('rate', rate)
    engine.setProperty('pitch', pitch)

    # Reserve a unique path but close our handle first: the engine opens the
    # path itself, which fails on platforms that forbid a second open.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        engine.save_to_file(text, wav_path)
        engine.runAndWait()
    except Exception as e:
        # Best-effort cleanup so a failed run does not leave a stray file.
        try:
            os.remove(wav_path)
        except OSError:
            pass
        return None, f"Error in speech generation: {e}"
    return wav_path, "Speech generated successfully"
 
 
 
 
 
 
 
 
 
 
48
 
49
def generate_sound(description):
    """Render a three-second sine tone chosen from a text description.

    The description is matched case-insensitively: text containing "high"
    selects 880 Hz, otherwise text containing "low" selects 220 Hz, and
    anything else keeps the generator's default frequency.

    Args:
        description: Free-text description of the sound; ``None`` is treated
            as an empty description.

    Returns:
        A ``(path, message)`` tuple. ``path`` is the generated WAV file, or
        ``None`` when saving failed, in which case ``message`` carries the
        error.
    """
    duration = 3  # seconds
    sample_rate = 44100
    # Sample instants spaced exactly 1/sample_rate apart. linspace(0, duration)
    # would include the endpoint and stretch the spacing to duration/(N-1).
    num_samples = int(sample_rate * duration)
    t = torch.arange(num_samples, dtype=torch.float32) / sample_rate

    generator = SimpleWaveformGenerator()
    wanted = (description or "").lower()
    if "high" in wanted:
        generator.frequency.data = torch.tensor(880.0)
    elif "low" in wanted:
        generator.frequency.data = torch.tensor(220.0)

    with torch.no_grad():
        audio = generator(t)

    # Scale to a peak amplitude of 1.0.
    audio = audio / audio.abs().max()

    # Close our own handle before saving so the writer can open the path
    # itself on platforms that forbid a second open.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        # unsqueeze(0) adds the leading channel dimension for a mono signal.
        torchaudio.save(wav_path, audio.unsqueeze(0), sample_rate)
    except Exception as e:
        # Best-effort cleanup so a failed save does not leave a stray file.
        try:
            os.remove(wav_path)
        except OSError:
            pass
        return None, f"Error in sound generation: {e}"
    return wav_path, "Sound generated successfully"
 
 
 
 
 
 
 
 
 
68
 
69
  # Gradio interface
70
  with gr.Blocks() as iface:
71
+ gr.Markdown("# Reliable Text-to-Speech and Sound Generation Tool")
72
 
73
  with gr.Tab("Text-to-Speech"):
74
  text_input = gr.Textbox(label="Enter text for speech generation")
75
  emotion_input = gr.Dropdown(["Neutral", "Happy", "Sad", "Angry"], label="Select Emotion", value="Neutral")
76
+ lang_input = gr.Dropdown(["en"], label="Select Language", value="en")
77
  speech_button = gr.Button("Generate Speech")
78
  speech_output = gr.Audio(label="Generated Speech")
79
  speech_message = gr.Textbox(label="Message")
80
 
81
+ with gr.Tab("Sound Generation"):
82
+ sound_input = gr.Textbox(label="Enter sound description (e.g., 'high', 'low', or leave blank for middle)")
83
  sound_button = gr.Button("Generate Sound")
84
  sound_output = gr.Audio(label="Generated Sound")
85
  sound_message = gr.Textbox(label="Message")
86
 
87
+ speech_button.click(text_to_speech_with_emotion,
88
+ inputs=[text_input, emotion_input, lang_input],
89
+ outputs=[speech_output, speech_message])
90
+ sound_button.click(generate_sound,
91
+ inputs=[sound_input],
92
+ outputs=[sound_output, sound_message])
93
 
94
  iface.launch()