Update main.py
main.py
CHANGED
@@ -3,15 +3,15 @@ drive.mount('/content/drive')
 
 """Install Dependencies"""
 
-
+pip install transformers librosa torch soundfile numba numpy TTS datasets gradio protobuf==3.20.3
 
 """Emotion Detection (Using Text Dataset)
 
 """
 
-
+!pip install --upgrade numpy tensorflow transformers TTS
 
-
+!pip freeze > requirements.txt
 
 from transformers import pipeline
 
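Note: the bare pip install line added above is only valid in a notebook cell (and even there it is normally written !pip or %pip, as the two lines below it are); in a plain main.py it is a SyntaxError. A script-safe sketch of the same install step, assuming the identical package list:

import subprocess
import sys

# Sketch: install the pinned dependencies through the running interpreter,
# equivalent to `python -m pip install ...`, so it also works outside notebooks.
def install(packages):
    subprocess.check_call([sys.executable, "-m", "pip", "install", *packages])

install([
    "transformers", "librosa", "torch", "soundfile", "numba",
    "numpy", "TTS", "datasets", "gradio", "protobuf==3.20.3",
])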
@@ -43,21 +43,26 @@ tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
 def generate_emotional_speech(text, emotion):
     # Map emotion to voice modulation parameters (pitch, speed)
     emotion_settings = {
-
-
-
-
-
-
-
-
-    }
-
+        "happy": {"pitch": 1.3, "speed": 1.2},     # Upbeat and energetic
+        "joy": {"pitch": 1.2, "speed": 1.1},       # Less exaggerated than 'happy'
+        "surprise": {"pitch": 1.5, "speed": 1.3},  # Excitement with high pitch and fast speech
+        "sad": {"pitch": 0.8, "speed": 0.9},       # Subdued, slow tone
+        "angry": {"pitch": 1.6, "speed": 1.4},     # Intense and sharp
+        "fear": {"pitch": 1.2, "speed": 0.95},     # Tense and slightly slow
+        "disgust": {"pitch": 0.9, "speed": 0.95},  # Low and deliberate
+        "shame": {"pitch": 0.8, "speed": 0.85},    # Quiet, subdued tone
+        "neutral": {"pitch": 1.0, "speed": 1.0},   # Baseline conversational tone
+    }
+
+
+
     # Retrieve pitch and speed based on detected emotion
     settings = emotion_settings.get(emotion, {"pitch": 1.0, "speed": 1.0})
-    # Generate speech with the TTS model
+    # Generate speech with the TTS model
+    # Instead of directly passing speed and pitch to tts_to_file,
     # We adjust the text to simulate the effect. This is a temporary solution.
-    # You might need to fine-tune these adjustments or consider a different TTS library
+    # You might need to fine-tune these adjustments or consider a different TTS library
+    # with better control over speech parameters.
     adjusted_text = text
     if settings['speed'] > 1.0:
         adjusted_text = adjusted_text.replace(" ", ".") # Simulate faster speech
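As the in-code comments concede, the space-to-period replacement only nudges the synthesizer's pausing; it does not genuinely change speaking rate, and nothing in this function applies the pitch value at all. One alternative is to post-process the generated WAV, sketched below on the assumption that librosa and soundfile are available (the 12-semitone mapping is a rough heuristic, not something from this repo):

import librosa
import soundfile as sf

def apply_emotion_prosody(audio_path, pitch=1.0, speed=1.0):
    # Post-process a generated WAV: time-stretch for speed, pitch-shift for pitch.
    y, sr = librosa.load(audio_path)
    if speed != 1.0:
        y = librosa.effects.time_stretch(y, rate=speed)  # rate > 1.0 speeds up
    if pitch != 1.0:
        # Map the multiplicative pitch factor onto semitones (heuristic)
        y = librosa.effects.pitch_shift(y, sr=sr, n_steps=(pitch - 1.0) * 12)
    sf.write(audio_path, y, sr)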
@@ -68,7 +73,6 @@ def generate_emotional_speech(text, emotion):
     audio_path = "output.wav" # Or any desired filename
     tts_model.tts_to_file(text=adjusted_text, file_path=audio_path) # Pass file_path argument
     return audio_path
-
 
 # Example usage
 emotion = "happy"
@@ -248,19 +252,7 @@ save_path = "/content/drive/My Drive/fine_tuned_tacotron2.pth"
 # Save the model's state dictionary using torch.save
 torch.save(model.state_dict(), save_path)
 
-
 """Set up the Gradio interface"""
-import librosa
-import soundfile as sf
-
-def adjust_pitch(audio_path, pitch_factor):
-    # Load audio
-    y, sr = librosa.load(audio_path)
-    # Adjust pitch
-    y_shifted = librosa.effects.pitch_shift(y, sr, n_steps=pitch_factor)
-    # Save adjusted audio
-    sf.write(audio_path, y_shifted, sr)
-
 
 import gradio as gr
 from transformers import pipeline
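For anyone restoring the helper deleted above: librosa.effects.pitch_shift(y, sr, n_steps=...) passes sr positionally, and in librosa 0.10+ sr is keyword-only, so that call raises a TypeError at runtime. A corrected version, should it be needed again:

import librosa
import soundfile as sf

def adjust_pitch(audio_path, pitch_factor):
    # Load the generated audio (librosa resamples to 22050 Hz by default)
    y, sr = librosa.load(audio_path)
    # sr and n_steps must be passed as keywords in librosa >= 0.10
    y_shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch_factor)
    # Overwrite the file with the shifted audio
    sf.write(audio_path, y_shifted, sr)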
@@ -274,17 +266,14 @@ tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
 
 # Emotion-specific settings for pitch and speed
 emotion_settings = {
-    "
-    "joy": {"pitch": 1.3, "speed": 1.2},
+    "joy": {"pitch": 1.2, "speed": 1.1},
     "sadness": {"pitch": 0.8, "speed": 0.9},
-    "anger": {"pitch": 1.
-    "fear": {"pitch":
-    "surprise": {"pitch": 1.
-    "
-    "shame": {"pitch": 0.8, "speed": 0.85},
+    "anger": {"pitch": 1.0, "speed": 1.2},
+    "fear": {"pitch": 0.9, "speed": 1.0},
+    "surprise": {"pitch": 1.3, "speed": 1.2},
+    "neutral": {"pitch": 1.0, "speed": 1.0},
 }
 
-
 # Function to process text or file input and generate audio
 def emotion_aware_tts_pipeline(input_text=None, file_input=None):
     try:
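One thing to watch: this second emotion_settings table and the earlier one inside generate_emotional_speech use different key sets ("joy"/"sadness"/"anger" here versus "happy"/"sad"/"angry" there), and emotion_settings.get(..., default) silently falls back to neutral prosody for any unmatched label. A small normalization sketch (the alias table is illustrative, not from this commit):

# Map alternate label spellings onto the keys this table actually defines,
# so near-miss labels don't silently fall back to neutral prosody.
LABEL_ALIASES = {"happy": "joy", "sad": "sadness", "angry": "anger"}

def lookup_settings(label):
    key = LABEL_ALIASES.get(label.lower(), label.lower())
    return emotion_settings.get(key, {"pitch": 1.0, "speed": 1.0})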
@@ -299,33 +288,24 @@ def emotion_aware_tts_pipeline(input_text=None, file_input=None):
             emotion = emotion_data['label']
             confidence = emotion_data['score']
 
-            # Adjust
+            # Adjust pitch and speed
             settings = emotion_settings.get(emotion.lower(), {"pitch": 1.0, "speed": 1.0})
-            speed = settings["speed"]
             pitch = settings["pitch"]
+            speed = settings["speed"]
-
-            if speed > 1.0:
-                input_text = input_text.replace(" ", ". ") # Faster speech simulation
-            elif speed < 1.0:
-                input_text = input_text.replace(" ", "... ") # Slower speech simulation
 
             # Generate audio
             audio_path = "output.wav"
-            tts_model.tts_to_file(text=input_text, file_path=audio_path)
-
-            # Adjust pitch
-            pitch_factor = (pitch - 1.0) * 12 # Convert to semitones for librosa
-            adjust_pitch(audio_path, pitch_factor)
+            tts_model.tts_to_file(text=input_text, file_path=audio_path, speed=speed, pitch=pitch)
 
             return f"Detected Emotion: {emotion} (Confidence: {confidence:.2f})", audio_path
         else:
             return "Please provide input text or file", None
     except Exception as e:
+        # Return error message if something goes wrong
         return f"Error: {str(e)}", None
 
-
 # Define Gradio interface
-interface = gr.Interface(
+iface = gr.Interface(
     fn=emotion_aware_tts_pipeline,
     inputs=[
         gr.Textbox(label="Input Text", placeholder="Enter text here"),
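A caution on the new tts_to_file(..., speed=speed, pitch=pitch) call: the Tacotron2-DDC model used here has no documented pitch control, so whether those keyword arguments are honored, or even accepted, depends on the installed TTS version. A defensive drop-in sketch for that line, falling back to plain synthesis plus the apply_emotion_prosody helper sketched earlier:

try:
    # Forward prosody hints if this TTS version accepts them
    tts_model.tts_to_file(text=input_text, file_path=audio_path,
                          speed=speed, pitch=pitch)
except TypeError:
    # Otherwise synthesize plainly and shape prosody as a post-process
    tts_model.tts_to_file(text=input_text, file_path=audio_path)
    apply_emotion_prosody(audio_path, pitch=pitch, speed=speed)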
@@ -340,4 +320,4 @@ interface = gr.Interface(
 )
 
 # Launch Gradio interface
-interface.launch()
+iface.launch()
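For context, emotion_aware_tts_pipeline returns a status string plus an audio path, and the unchanged middle of the interface definition is not shown in this diff. A sketch of what the full definition presumably looks like after this commit (the gr.File input, the output components, and the title are assumptions inferred from the function's signature and return values):

import gradio as gr

iface = gr.Interface(
    fn=emotion_aware_tts_pipeline,
    inputs=[
        gr.Textbox(label="Input Text", placeholder="Enter text here"),
        gr.File(label="Input File"),           # assumed: matches file_input
    ],
    outputs=[
        gr.Textbox(label="Detected Emotion"),  # status or error string
        gr.Audio(label="Generated Speech"),    # path to output.wav
    ],
    title="Emotion-Aware TTS",                 # assumed
)

# Launch the app
iface.launch()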