Anita-19 committed on
Commit
bbd5b58
Β·
verified Β·
1 Parent(s): 8573520

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +51 -49
main.py CHANGED
@@ -1,17 +1,16 @@
1
  from google.colab import drive
2
  drive.mount('/content/drive')
3
 
4
- """Install Dependencies"""
5
-
6
  pip install transformers librosa torch soundfile numba numpy TTS datasets gradio protobuf==3.20.3
7
 
8
- """Emotion Detection (Using Text Dataset)
9
 
10
- """
11
 
12
  !pip install --upgrade numpy tensorflow transformers TTS
13
 
14
- !pip freeze > requirements.txt
15
 
16
  from transformers import pipeline
17
 
@@ -29,7 +28,7 @@ text = "I am feeling excited today!"
29
  emotion, confidence = detect_emotion(text)
30
  print(f"Detected Emotion: {emotion}, Confidence: {confidence}")
31
 
32
- """Emotion-Aware TTS (Using Tacotron 2 or Similar)"""
33
 
34
  import torch
35
  import librosa
@@ -40,17 +39,19 @@ from TTS.api import TTS # Using Coqui TTS for simplicity
40
  tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
41
 
42
 
43
- emotion_settings = {
44
- "neutral": {"pitch": 1.0, "speed": 1.0, "prosody": 0.5}, # Neutral tone
45
- "joy": {"pitch": 1.3, "speed": 1.2, "prosody": 1.5}, # Upbeat, energetic
46
- "sadness": {"pitch": 0.8, "speed": 0.9, "prosody": 0.8}, # Subdued, slow tone
47
- "anger": {"pitch": 1.6, "speed": 1.4, "prosody": 1.8}, # Sharp, intense
48
- "fear": {"pitch": 1.2, "speed": 0.95, "prosody": 1.2}, # Tense, slow
49
- "surprise": {"pitch": 1.5, "speed": 1.3, "prosody": 1.4}, # Excited, high energy
50
- "disgust": {"pitch": 0.9, "speed": 0.95, "prosody": 0.6}, # Low, deliberate
51
- "shame": {"pitch": 0.8, "speed": 0.85, "prosody": 0.5}, # Quiet, subdued tone
52
- }
 
53
 
 
54
 
55
  import librosa
56
  import soundfile as sf
@@ -73,32 +74,34 @@ def adjust_speed(audio_path, speed_factor):
73
  # Save the adjusted audio
74
  sf.write(audio_path, y_speeded, sr)
75
 
76
-
77
-
78
- def generate_emotional_speech(text, emotion):
79
- # Retrieve pitch, speed, and prosody based on detected emotion
80
- settings = emotion_settings.get(emotion, {"pitch": 1.0, "speed": 1.0, "prosody": 1.0})
81
- pitch = settings["pitch"]
82
- speed = settings["speed"]
83
-
84
- # Generate mel spectrogram with TTS
85
- mel_spectrogram = tts_model.get_mel_spectrogram(text)
86
-
87
- # Use HiFi-GAN vocoder to decode the spectrogram into waveform
88
- audio = vocoder.decode(mel_spectrogram)
89
- audio_path = "output.wav"
90
- librosa.output.write_wav(audio_path, audio, sr=22050) # Save the initial audio
 
 
91
 
92
- # Apply post-processing: adjust pitch and speed
93
- adjust_pitch_and_speed(audio_path, pitch_factor=pitch, speed_factor=speed)
 
 
94
 
95
- return audio_path
96
 
97
- # Integrating Emotion Detection and TTS Pipeline
98
  from IPython.display import Audio, display
99
 
100
  def emotion_aware_tts_pipeline(text):
101
- # Ensure the emotion_classifier is being accessed globally
102
  emotion, confidence = detect_emotion(text)
103
  print(f"Emotion Detected: {emotion} with Confidence: {confidence:.2f}")
104
 
@@ -210,7 +213,7 @@ tokenizer.save_pretrained(tokenizer_save_path)
210
 
211
  print("Model and tokenizer saved to Google Drive.")
212
 
213
- """Reload the Fine-Tuned Model"""
214
 
215
  from transformers import AutoModelForSequenceClassification, AutoTokenizer
216
 
@@ -228,7 +231,7 @@ tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)
228
 
229
  print("Fine-tuned model and tokenizer loaded successfully.")
230
 
231
- """Test the Reloaded Model"""
232
 
233
  from transformers import pipeline
234
 
@@ -240,7 +243,7 @@ text = "I feel so upset today!"
240
  result = emotion_classifier(text)
241
  print(result)
242
 
243
- """Fine-tuning the TTS System"""
244
 
245
  from TTS.api import TTS
246
  from TTS.utils.audio import AudioProcessor
@@ -268,7 +271,7 @@ save_path = "/content/drive/My Drive/fine_tuned_tacotron2.pth"
268
  torch.save(model.state_dict(), save_path)
269
 
270
 
271
- """Set up the Gradio interface"""
272
 
273
  import gradio as gr
274
  from transformers import pipeline
@@ -282,18 +285,17 @@ tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
282
 
283
  # Emotion-specific settings for pitch and speed
284
  emotion_settings = {
285
- "neutral": {"pitch": 1.0, "speed": 1.0, "prosody": 0.5}, # Neutral tone
286
- "joy": {"pitch": 1.3, "speed": 1.2, "prosody": 1.5}, # Upbeat, energetic
287
- "sadness": {"pitch": 0.8, "speed": 0.9, "prosody": 0.8}, # Subdued, slow tone
288
- "anger": {"pitch": 1.6, "speed": 1.4, "prosody": 1.8}, # Sharp, intense
289
- "fear": {"pitch": 1.2, "speed": 0.95, "prosody": 1.2}, # Tense, slow
290
- "surprise": {"pitch": 1.5, "speed": 1.3, "prosody": 1.4}, # Excited, high energy
291
- "disgust": {"pitch": 0.9, "speed": 0.95, "prosody": 0.6}, # Low, deliberate
292
- "shame": {"pitch": 0.8, "speed": 0.85, "prosody": 0.5}, # Quiet, subdued tone
293
  }
294
 
295
 
296
-
297
  # Function to process text or file input and generate audio
298
  def emotion_aware_tts_pipeline(input_text=None, file_input=None):
299
  try:
 
1
  from google.colab import drive
2
  drive.mount('/content/drive')
3
 
4
+ #Install Dependencies"""
5
+ """
6
  pip install transformers librosa torch soundfile numba numpy TTS datasets gradio protobuf==3.20.3
7
 
8
+ #Emotion Detection (Using Text Dataset)
9
 
 
10
 
11
  !pip install --upgrade numpy tensorflow transformers TTS
12
 
13
+ !pip freeze > requirements.txt"""
14
 
15
  from transformers import pipeline
16
 
 
28
  emotion, confidence = detect_emotion(text)
29
  print(f"Detected Emotion: {emotion}, Confidence: {confidence}")
30
 
31
+ #Emotion-Aware TTS (Using Tacotron 2 or Similar)"""
32
 
33
  import torch
34
  import librosa
 
39
  tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
40
 
41
 
42
+ def generate_emotional_speech(text, emotion):
43
+ # Map emotion to voice modulation parameters (pitch, speed)
44
+ emotion_settings = {
45
+ "neutral": {"pitch": 1.0, "speed": 1.0}, # Baseline conversational tone
46
+ "joy": {"pitch": 1.3, "speed": 1.2}, # Upbeat and energetic
47
+ "sadness": {"pitch": 0.8, "speed": 0.9}, # Subdued, slow tone
48
+ "anger": {"pitch": 1.6, "speed": 1.4}, # Intense and sharp
49
+ "fear": {"pitch": 1.2, "speed": 0.95}, # Tense and slightly slow
50
+ "surprise": {"pitch": 1.5, "speed": 1.3}, # Excitement with high pitch and fast speech
51
+ "disgust": {"pitch": 0.9, "speed": 0.95}, # Low and deliberate
52
+ "shame": {"pitch": 0.8, "speed": 0.85}, # Quiet, subdued tone
53
 
54
+ }
55
 
56
  import librosa
57
  import soundfile as sf
 
74
  # Save the adjusted audio
75
  sf.write(audio_path, y_speeded, sr)
76
 
77
+ # Retrieve pitch and speed based on detected emotion
78
+ settings = emotion_settings.get(emotion, {"pitch": 1.0, "speed": 1.0})
79
+ # Generate speech with the TTS model
80
+ # Instead of directly passing speed and pitch to tts_to_file,
81
+ # We adjust the text to simulate the effect. This is a temporary solution.
82
+ # You might need to fine-tune these adjustments or consider a different TTS library
83
+ # with better control over speech parameters.
84
+ adjusted_text = text
85
+ if settings['speed'] > 1.0:
86
+ adjusted_text = adjusted_text.replace(" ", ".") # Simulate faster speech
87
+ elif settings['speed'] < 1.0:
88
+ adjusted_text = adjusted_text.replace(" ", "...") # Simulate slower speech
89
+
90
+ # Explicitly specify the output path
91
+ audio_path = "output.wav" # Or any desired filename
92
+ tts_model.tts_to_file(text=adjusted_text, file_path=audio_path) # Pass file_path argument
93
+ return audio_path
94
 
95
+ # Example usage
96
+ emotion = "happy"
97
+ output_audio = generate_emotional_speech("Welcome to the smart library!", emotion)
98
+ print(f"Generated Speech Saved At: {output_audio}")
99
 
100
+ """Integrating the Workflow"""
101
 
 
102
  from IPython.display import Audio, display
103
 
104
  def emotion_aware_tts_pipeline(text):
 
105
  emotion, confidence = detect_emotion(text)
106
  print(f"Emotion Detected: {emotion} with Confidence: {confidence:.2f}")
107
 
 
213
 
214
  print("Model and tokenizer saved to Google Drive.")
215
 
216
+ #Reload the Fine-Tuned Model"""
217
 
218
  from transformers import AutoModelForSequenceClassification, AutoTokenizer
219
 
 
231
 
232
  print("Fine-tuned model and tokenizer loaded successfully.")
233
 
234
+ #Test the Reloaded Model"""
235
 
236
  from transformers import pipeline
237
 
 
243
  result = emotion_classifier(text)
244
  print(result)
245
 
246
+ #Fine-tuning the TTS System"""
247
 
248
  from TTS.api import TTS
249
  from TTS.utils.audio import AudioProcessor
 
271
  torch.save(model.state_dict(), save_path)
272
 
273
 
274
+ #Set up the Gradio interface
275
 
276
  import gradio as gr
277
  from transformers import pipeline
 
285
 
286
  # Emotion-specific settings for pitch and speed
287
  emotion_settings = {
288
+ "neutral": {"pitch": 1.0, "speed": 1.0},
289
+ "joy": {"pitch": 1.3, "speed": 1.2},
290
+ "sadness": {"pitch": 0.8, "speed": 0.9},
291
+ "anger": {"pitch": 1.6, "speed": 1.4},
292
+ "fear": {"pitch": 1.2, "speed": 0.95},
293
+ "surprise": {"pitch": 1.5, "speed": 1.3},
294
+ "disgust": {"pitch": 0.9, "speed": 0.95},
295
+ "shame": {"pitch": 0.8, "speed": 0.85},
296
  }
297
 
298
 
 
299
  # Function to process text or file input and generate audio
300
  def emotion_aware_tts_pipeline(input_text=None, file_input=None):
301
  try: