capradeepgujaran commited on
Commit
82519df
·
verified ·
1 Parent(s): 2ae3aa9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -50
app.py CHANGED
@@ -1,13 +1,12 @@
1
  import gradio as gr
2
  from gtts import gTTS
3
- import numpy as np
4
- import soundfile as sf
5
- from pydub import AudioSegment
6
  import io
7
  import tempfile
8
- import os
 
9
 
10
  def text_to_speech_with_emotion(text, emotion, language='en'):
 
11
  tts = gTTS(text=text, lang=language, slow=False)
12
 
13
  with io.BytesIO() as fp:
@@ -15,69 +14,45 @@ def text_to_speech_with_emotion(text, emotion, language='en'):
15
  fp.seek(0)
16
  audio = AudioSegment.from_mp3(fp)
17
 
18
- # Adjust audio based on emotion (more subtle adjustments)
19
  if emotion == "Happy":
20
- audio = audio.speedup(playback_speed=1.1)
 
21
  elif emotion == "Sad":
22
- audio = audio.speedup(playback_speed=0.9)
 
23
  elif emotion == "Angry":
24
- audio = audio + 2 # Increase volume slightly
 
 
25
 
26
- # Apply a high-pass filter to reduce low-frequency noise
27
- audio = audio.high_pass_filter(80)
 
28
 
29
  return audio
30
 
31
- def generate_subtle_emotion_sound(emotion, duration):
32
- sr = 44100 # Higher sample rate for better quality
33
- t = np.linspace(0, duration, int(sr * duration), False)
34
-
35
- if emotion == "Happy":
36
- freq = 440 # A4 note
37
- audio = np.sin(2 * np.pi * freq * t) * np.exp(-3 * t)
38
- elif emotion == "Sad":
39
- freq = 294 # D4 note
40
- audio = np.sin(2 * np.pi * freq * t) * np.exp(-2 * t)
41
- elif emotion == "Angry":
42
- freq = 392 # G4 note
43
- audio = np.sign(np.sin(2 * np.pi * freq * t)) * np.exp(-4 * t)
44
- else: # Neutral
45
- freq = 329 # E4 note
46
- audio = np.sin(2 * np.pi * freq * t) * np.exp(-3 * t)
47
-
48
- audio = audio / np.max(np.abs(audio)) # Normalize
49
- return AudioSegment(audio.astype(np.float32).tobytes(),
50
- frame_rate=sr, sample_width=4, channels=1)
51
-
52
- def generate_dialogue_and_sound(text, emotion, language):
53
- speech = text_to_speech_with_emotion(text, emotion, language)
54
- sound = generate_subtle_emotion_sound(emotion, 1) # 1 second of subtle emotion sound
55
-
56
- # Reduce the volume of the sound effect
57
- sound = sound - 12 # Reduce volume by 12 dB
58
-
59
- # Combine speech and sound
60
- combined = speech.overlay(sound, position=0, gain_during_overlay=-12)
61
 
62
- # Apply some final processing
63
- combined = combined.compress_dynamic_range() # Smooth out volume differences
64
- combined = combined.normalize() # Normalize the final audio
65
 
66
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
67
- combined.export(fp.name, format="mp3", bitrate="192k") # Higher bitrate for better quality
68
  return fp.name
69
 
70
  # Gradio interface
71
  iface = gr.Interface(
72
- fn=generate_dialogue_and_sound,
73
  inputs=[
74
- gr.Textbox(label="Enter dialogue text"),
75
- gr.Radio(["Happy", "Sad", "Angry", "Neutral"], label="Emotion", value="Neutral"),
76
  gr.Dropdown(["en", "es", "fr", "de", "it"], label="Language", value="en")
77
  ],
78
- outputs=gr.Audio(label="Generated Dialogue with Emotion"),
79
- title="Improved Emotional Dialogue Generator",
80
- description="Generate clear dialogues with subtle emotional cues."
81
  )
82
 
83
  iface.launch()
 
1
  import gradio as gr
2
  from gtts import gTTS
 
 
 
3
  import io
4
  import tempfile
5
+ from pydub import AudioSegment
6
+ import numpy as np
7
 
8
def _change_speed(segment, rate):
    """Return *segment* resampled so playback is *rate* times as fast.

    Unlike pydub's ``speedup()`` this also works for rates below 1.0; as a
    side effect the pitch shifts by the same factor, which the emotion
    presets below rely on.
    """
    resampled = segment._spawn(
        segment.raw_data,
        overrides={"frame_rate": int(segment.frame_rate * rate)},
    )
    # Restore the original frame rate so players interpret the (now
    # stretched/compressed) samples at normal speed.
    return resampled.set_frame_rate(segment.frame_rate)


def text_to_speech_with_emotion(text, emotion, language='en'):
    """Synthesize *text* with gTTS and apply a simple emotion treatment.

    Args:
        text: Text to speak.
        emotion: "Happy", "Sad" or "Angry"; any other value is neutral.
        language: gTTS language code (default "en").

    Returns:
        pydub.AudioSegment containing the processed speech.
    """
    # Generate base speech (network call to the Google TTS service).
    tts = gTTS(text=text, lang=language, slow=False)

    with io.BytesIO() as fp:
        # NOTE(review): this line was hidden diff context in the commit view;
        # restored as the standard gTTS in-memory export — confirm.
        tts.write_to_fp(fp)
        fp.seek(0)
        audio = AudioSegment.from_mp3(fp)

    # Adjust audio based on emotion.
    if emotion == "Happy":
        audio = audio.speedup(playback_speed=1.15)
        # BUG FIX: AudioSegment has no pitch_shift(). Raise pitch ~1 semitone
        # by resampling (this also speeds playback up by ~6%).
        audio = _change_speed(audio, 2 ** (1 / 12))
    elif emotion == "Sad":
        # BUG FIX: pydub's speedup() only supports speeds above 1.0, so
        # speedup(playback_speed=0.85) was broken. Resampling to 0.85x both
        # slows the speech and lowers its pitch, matching the original
        # intent of speedup(0.85) + pitch_shift(semitones=-1).
        audio = _change_speed(audio, 0.85)
    elif emotion == "Angry":
        audio = audio.speedup(playback_speed=1.1)
        audio = audio + 3  # Increase volume slightly
    # Neutral emotion remains unchanged.

    # Apply some subtle enhancements.
    audio = audio.compress_dynamic_range(threshold=-15, ratio=2.0, attack=5, release=50)
    audio = audio.high_pass_filter(80)  # Remove very low frequencies

    return audio
34
 
35
def generate_emotional_speech(text, emotion, language):
    """Run the full TTS pipeline and export the result to a temporary MP3.

    Returns the path of the exported file; the file is deliberately left on
    disk so the Gradio frontend can serve it after this function returns.
    """
    processed = text_to_speech_with_emotion(text, emotion, language)

    # Normalize the final audio
    processed = processed.normalize()

    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    with tmp:
        # Higher bitrate for better quality
        processed.export(tmp.name, format="mp3", bitrate="192k")
    return tmp.name
44
 
45
# Gradio interface: wires the TTS pipeline to a simple web UI.
_text_input = gr.Textbox(label="Enter text for speech")
_emotion_input = gr.Radio(["Neutral", "Happy", "Sad", "Angry"], label="Emotion", value="Neutral")
_language_input = gr.Dropdown(["en", "es", "fr", "de", "it"], label="Language", value="en")

iface = gr.Interface(
    fn=generate_emotional_speech,
    inputs=[_text_input, _emotion_input, _language_input],
    outputs=gr.Audio(label="Generated Emotional Speech"),
    title="Clean Emotional Text-to-Speech Generator",
    description="Generate clean speech with emotional variations without background sounds.",
)

iface.launch()