capradeepgujaran committed on
Commit
4882c44
·
verified ·
1 Parent(s): 7497985

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -38
app.py CHANGED
@@ -1,30 +1,10 @@
1
  import gradio as gr
2
- import torch
3
- import torchaudio
4
- from torch import nn
5
  import numpy as np
6
  import tempfile
7
  import os
8
  from gtts import gTTS
9
  from pydub import AudioSegment
10
-
11
- # Placeholder functions for emotion evaluation
12
- # These are simplified versions and may not provide accurate results
13
- def emo2vec_sim(ref_paths, gen_paths):
14
- # Placeholder implementation
15
- return [(ref, gen, np.random.random(), np.random.random()) for ref, gen in zip(ref_paths, gen_paths)]
16
-
17
- def arousal_valence_sim(ref_paths, gen_paths):
18
- # Placeholder implementation
19
- return [(ref, gen, np.random.random(), np.random.random()) for ref, gen in zip(ref_paths, gen_paths)]
20
-
21
- class SimpleWaveformGenerator(nn.Module):
22
- def __init__(self):
23
- super().__init__()
24
- self.frequency = nn.Parameter(torch.tensor(440.0))
25
-
26
- def forward(self, t):
27
- return torch.sin(2 * np.pi * self.frequency * t)
28
 
29
  def text_to_speech_with_emotion(text, lang, emotion):
30
  try:
@@ -37,9 +17,9 @@ def text_to_speech_with_emotion(text, lang, emotion):
37
  audio = AudioSegment.from_mp3(fp.name)
38
 
39
  if emotion == "Happy":
40
- audio = audio.pitch_shift(semitones=1).speedup(playback_speed=1.1)
41
  elif emotion == "Sad":
42
- audio = audio.pitch_shift(semitones=-1).speedup(playback_speed=0.9)
43
  elif emotion == "Angry":
44
  audio = audio + 5 # Increase volume
45
  audio = audio.compress_dynamic_range(threshold=-15.0, ratio=3.0, attack=5.0, release=50.0)
@@ -52,25 +32,42 @@ def text_to_speech_with_emotion(text, lang, emotion):
52
  def generate_sound_effect(description, duration):
53
  try:
54
  sample_rate = 44100
55
- t = torch.linspace(0, duration, int(sample_rate * duration))
56
-
57
- generator = SimpleWaveformGenerator()
58
- if "high" in description.lower():
59
- generator.frequency.data = torch.tensor(880.0)
60
- elif "low" in description.lower():
61
- generator.frequency.data = torch.tensor(220.0)
62
 
63
- with torch.no_grad():
64
- audio = generator(t)
65
-
66
- audio = audio / audio.abs().max()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
69
- torchaudio.save(fp.name, audio.unsqueeze(0), sample_rate)
70
- return fp.name, "Sound effect generated successfully"
71
  except Exception as e:
72
  return None, f"Error in sound effect generation: {str(e)}"
73
 
 
 
 
 
 
 
 
74
  def evaluate_emotion(ref_audio, gen_audio, uttwise_score=False):
75
  try:
76
  ref_paths = [ref_audio]
@@ -110,7 +107,7 @@ def evaluate_emotion(ref_audio, gen_audio, uttwise_score=False):
110
 
111
  # Gradio interface
112
  with gr.Blocks() as iface:
113
- gr.Markdown("# Integrated TTS, Sound Generation, and Emotion Evaluation Tool")
114
 
115
  with gr.Tab("Text-to-Speech"):
116
  text_input = gr.Textbox(label="Enter text for speech generation")
@@ -121,7 +118,7 @@ with gr.Blocks() as iface:
121
  speech_message = gr.Textbox(label="Message")
122
 
123
  with gr.Tab("Sound Effect Generation"):
124
- sfx_input = gr.Textbox(label="Enter description for sound effect (e.g., 'high', 'low', or leave blank for middle)")
125
  sfx_duration = gr.Slider(minimum=1, maximum=10, value=3, label="Duration (seconds)")
126
  sfx_button = gr.Button("Generate Sound Effect")
127
  sfx_output = gr.Audio(label="Generated Sound Effect")
 
1
  import gradio as gr
 
 
 
2
  import numpy as np
3
  import tempfile
4
  import os
5
  from gtts import gTTS
6
  from pydub import AudioSegment
7
+ from pydub.generators import WhiteNoise, Sine
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  def text_to_speech_with_emotion(text, lang, emotion):
10
  try:
 
17
  audio = AudioSegment.from_mp3(fp.name)
18
 
19
  if emotion == "Happy":
20
+ audio = audio.speedup(playback_speed=1.1)
21
  elif emotion == "Sad":
22
+ audio = audio.speedup(playback_speed=0.9)
23
  elif emotion == "Angry":
24
  audio = audio + 5 # Increase volume
25
  audio = audio.compress_dynamic_range(threshold=-15.0, ratio=3.0, attack=5.0, release=50.0)
 
32
def generate_sound_effect(description, duration):
    """Synthesize a simple sound effect from a text description.

    Args:
        description: Free-text description; the keywords 'rain', 'car horn',
            'wind', and 'bird' select a preset. Anything else falls back to
            a plain 440 Hz tone.
        duration: Desired length in seconds.

    Returns:
        Tuple of (path to a temporary WAV file, status message), or
        (None, error message) if generation fails.
    """
    try:
        sample_rate = 44100
        duration_ms = int(duration * 1000)
        desc = description.lower()

        if "rain" in desc:
            # Soft broadband noise reads as rainfall.
            sound = WhiteNoise(sample_rate=sample_rate).to_audio_segment(duration=duration_ms)
            sound = sound.apply_gain(-10)  # Make it softer
        elif "car horn" in desc:
            # Short beep followed by a gap, repeated enough times to fill
            # the requested duration (was hard-coded to 3 repeats, which
            # ignored the `duration` argument).
            beep = Sine(440, sample_rate=sample_rate).to_audio_segment(duration=100)
            beep = beep.append(AudioSegment.silent(duration=50), crossfade=25)
            repeats = max(1, duration_ms // max(1, len(beep)))
            sound = beep * repeats
        elif "wind" in desc:
            # Low-pass-filtered soft noise approximates wind.
            sound = WhiteNoise(sample_rate=sample_rate).to_audio_segment(duration=duration_ms)
            sound = sound.apply_gain(-15)  # Make it softer
            sound = sound.low_pass_filter(1000)  # Remove high frequencies
        elif "bird" in desc:
            # Two-tone chirp with a pause between chirps.
            chirp = Sine(1000, sample_rate=sample_rate).to_audio_segment(duration=100)
            chirp = chirp.append(Sine(1200, sample_rate=sample_rate).to_audio_segment(duration=100), crossfade=25)
            chirp = chirp.append(AudioSegment.silent(duration=200))
            # Guard against a zero repeat count: int(duration * 2)
            # truncates to 0 for durations under 0.5 s.
            sound = chirp * max(1, int(duration * 2))
        else:
            # Default to a simple tone.
            sound = Sine(440, sample_rate=sample_rate).to_audio_segment(duration=duration_ms)

        # Trim or pad so every preset honours the requested duration.
        if len(sound) > duration_ms:
            sound = sound[:duration_ms]
        elif len(sound) < duration_ms:
            sound = sound + AudioSegment.silent(duration=duration_ms - len(sound))

        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
            sound.export(fp.name, format="wav")
            return fp.name, f"Sound effect generated for '{description}'"
    except Exception as e:
        return None, f"Error in sound effect generation: {str(e)}"
63
 
64
# Placeholder functions for emotion evaluation
def emo2vec_sim(ref_paths, gen_paths):
    """Placeholder emo2vec similarity.

    Pairs each reference path with its generated counterpart and attaches
    two random scores in [0, 1) per pair (stand-ins for real similarity).
    """
    results = []
    for ref_path, gen_path in zip(ref_paths, gen_paths):
        results.append((ref_path, gen_path, np.random.random(), np.random.random()))
    return results
67
+
68
def arousal_valence_sim(ref_paths, gen_paths):
    """Placeholder arousal/valence similarity.

    Returns one (ref, gen, score, score) tuple per matched pair; scores
    are random values in [0, 1) standing in for a real model's output.
    """
    scored = []
    for idx, ref in enumerate(ref_paths):
        if idx >= len(gen_paths):
            break  # mirror zip's truncation to the shorter list
        scored.append((ref, gen_paths[idx], np.random.random(), np.random.random()))
    return scored
70
+
71
  def evaluate_emotion(ref_audio, gen_audio, uttwise_score=False):
72
  try:
73
  ref_paths = [ref_audio]
 
107
 
108
  # Gradio interface
109
  with gr.Blocks() as iface:
110
+ gr.Markdown("# Improved TTS and Sound Generation Tool")
111
 
112
  with gr.Tab("Text-to-Speech"):
113
  text_input = gr.Textbox(label="Enter text for speech generation")
 
118
  speech_message = gr.Textbox(label="Message")
119
 
120
  with gr.Tab("Sound Effect Generation"):
121
+ sfx_input = gr.Textbox(label="Enter description for sound effect (e.g., 'rain', 'car horn', 'wind', 'bird')")
122
  sfx_duration = gr.Slider(minimum=1, maximum=10, value=3, label="Duration (seconds)")
123
  sfx_button = gr.Button("Generate Sound Effect")
124
  sfx_output = gr.Audio(label="Generated Sound Effect")