Spaces:

Anita-19
/

emotion-aware-tts

Running

App Files Files Community

Anita-19 committed on Jan 23

Commit

8573520

verified ·

1 Parent(s): 8b40da9

Update main.py

Browse files

Files changed (1) hide show

main.py +134 -52

main.py CHANGED Viewed

@@ -1,19 +1,24 @@
 from google.colab import drive
 drive.mount('/content/drive')
-# Install Dependencies
-!pip install transformers librosa torch soundfile numba numpy TTS datasets gradio protobuf==3.20.3
-# Emotion Detection (Using Text Dataset)
 !pip install --upgrade numpy tensorflow transformers TTS
 from transformers import pipeline
-# Initialize the emotion classifier pipeline globally
 emotion_classifier = pipeline("text-classification", model="bhadresh-savani/distilbert-base-uncased-emotion")
 def detect_emotion(text):
-    # Ensure the emotion_classifier is used properly
     result = emotion_classifier(text)
     emotion = result[0]['label']
     confidence = result[0]['score']
@@ -24,7 +29,8 @@ text = "I am feeling excited today!"
 emotion, confidence = detect_emotion(text)
 print(f"Detected Emotion: {emotion}, Confidence: {confidence}")
-# Emotion-Aware TTS (Using Tacotron 2 or Similar)
 import torch
 import librosa
 import numpy as np
@@ -33,14 +39,7 @@ from TTS.api import TTS  # Using Coqui TTS for simplicity
 # Load TTS model and vocoder automatically during initialization
 tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
-# HiFi-GAN Vocoder (Ensure you have the model or download it)
-from TTS.utils.generic_utils import download_model
-from TTS.vocoder.hifigan import HIFIGAN
-vocoder_model = download_model("hifigan_ljspeech")
-vocoder = HIFIGAN(vocoder_model)
-# Emotion-specific settings for pitch, speed, and prosody
 emotion_settings = {
     "neutral": {"pitch": 1.0, "speed": 1.0, "prosody": 0.5},  # Neutral tone
     "joy": {"pitch": 1.3, "speed": 1.2, "prosody": 1.5},      # Upbeat, energetic
@@ -52,18 +51,29 @@ emotion_settings = {
     "shame": {"pitch": 0.8, "speed": 0.85, "prosody": 0.5},    # Quiet, subdued tone
 }
-def adjust_pitch_and_speed(audio_path, pitch_factor, speed_factor):
-    # Load audio file
-    y, sr = librosa.load(audio_path)
     # Adjust pitch
-    y_pitch = librosa.effects.pitch_shift(y, sr, n_steps=pitch_factor)
-    # Adjust speed
-    y_speed = librosa.effects.time_stretch(y_pitch, speed_factor)
     # Save the adjusted audio
-    sf.write(audio_path, y_speed, sr)
 def generate_emotional_speech(text, emotion):
     # Retrieve pitch, speed, and prosody based on detected emotion
@@ -101,9 +111,12 @@ def emotion_aware_tts_pipeline(text):
 # Example usage
 emotion_aware_tts_pipeline("I can’t stooop smiiiling, everything feels perrrfect!")
-# Fine-tuning the Emotion Detection Model (if needed)
 import os
 os.environ["WANDB_DISABLED"] = "true"
 from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
 from datasets import load_dataset
@@ -115,6 +128,9 @@ tokenizer = AutoTokenizer.from_pretrained("bhadresh-savani/distilbert-base-uncas
 # Define a function to map emotion labels to integers
 def map_emotion_to_int(example):
     emotion_mapping = {
         "neutral": 0,
         "joy": 1,
@@ -125,18 +141,27 @@ def map_emotion_to_int(example):
         "disgust": 6,
         "shame": 7,
     }
-    example['label'] = emotion_mapping.get(example['label'], -1)
     return example
 def preprocess_data(example):
-    return tokenizer(example['text'], truncation=True, padding=True, max_length=512)
 # Apply emotion mapping before tokenization
 dataset = dataset.map(map_emotion_to_int, batched=False)
-dataset = dataset.filter(lambda example: example['label'] != -1)
 tokenized_dataset = dataset.map(preprocess_data, batched=True, remove_columns=['text'])
 # Load model
 model = AutoModelForSequenceClassification.from_pretrained(
     "bhadresh-savani/distilbert-base-uncased-emotion",
     num_labels=8,
@@ -145,24 +170,25 @@ model = AutoModelForSequenceClassification.from_pretrained(
 # Training arguments
 training_args = TrainingArguments(
-    output_dir="./results",
-    evaluation_strategy="epoch",
-    learning_rate=5e-5,
-    per_device_train_batch_size=16,
-    gradient_accumulation_steps=4,
-    num_train_epochs=5,
-    weight_decay=0.01,
-    save_strategy="epoch",
-    logging_dir="./logs",
-    logging_steps=100,
-    warmup_steps=500,
-    save_total_limit=3,
-    fp16=True,
-    load_best_model_at_end=True,
-    metric_for_best_model="eval_loss",
-    greater_is_better=False,
 )
 # Train model
 trainer = Trainer(
     model=model,
@@ -174,15 +200,18 @@ trainer = Trainer(
 trainer.train()
-# Save the model and tokenizer
 model_save_path = "/content/drive/My Drive/emotion_detection_model1"
 tokenizer_save_path = "/content/drive/My Drive/emotion_detection_model1"
 model.save_pretrained(model_save_path)
 tokenizer.save_pretrained(tokenizer_save_path)
 print("Model and tokenizer saved to Google Drive.")
-# Reload the Fine-Tuned Model
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
 # Mount Google Drive
@@ -199,7 +228,8 @@ tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)
 print("Fine-tuned model and tokenizer loaded successfully.")
-# Test the Reloaded Model
 from transformers import pipeline
 # Create a text classification pipeline with the loaded model
@@ -210,10 +240,62 @@ text = "I feel so upset today!"
 result = emotion_classifier(text)
 print(result)
-# Set up the Gradio interface
 import gradio as gr
-def emotion_aware_tts_pipeline_gradio(input_text=None, file_input=None):
     try:
         # Get text from input or file
         if file_input:
@@ -233,11 +315,9 @@ def emotion_aware_tts_pipeline_gradio(input_text=None, file_input=None):
             # Generate audio
             audio_path = "output.wav"
-            mel_spectrogram = tts_model.get_mel_spectrogram(input_text)
-            audio = vocoder.decode(mel_spectrogram)
-            # Post-processing: adjust pitch and speed
-            adjust_pitch_and_speed(audio_path, pitch_factor=pitch, speed_factor=speed)
             return f"Detected Emotion: {emotion} (Confidence: {confidence:.2f})", audio_path
         else:
@@ -245,9 +325,11 @@ def emotion_aware_tts_pipeline_gradio(input_text=None, file_input=None):
     except Exception as e:
         return f"Error: {str(e)}", None
 # Define Gradio interface
 iface = gr.Interface(
-    fn=emotion_aware_tts_pipeline_gradio,
     inputs=[
         gr.Textbox(label="Input Text", placeholder="Enter text here"),
         gr.File(label="Upload a Text File")

 from google.colab import drive
 drive.mount('/content/drive')
+"""Install Dependencies"""
+pip install transformers librosa torch soundfile numba numpy TTS datasets gradio protobuf==3.20.3
+"""Emotion Detection (Using Text Dataset)
+"""
 !pip install --upgrade numpy tensorflow transformers TTS
+!pip freeze > requirements.txt
 from transformers import pipeline
+# Load pre-trained model for emotion detection
 emotion_classifier = pipeline("text-classification", model="bhadresh-savani/distilbert-base-uncased-emotion")
 def detect_emotion(text):
     result = emotion_classifier(text)
     emotion = result[0]['label']
     confidence = result[0]['score']
 emotion, confidence = detect_emotion(text)
 print(f"Detected Emotion: {emotion}, Confidence: {confidence}")
+"""Emotion-Aware TTS (Using Tacotron 2 or Similar)"""
 import torch
 import librosa
 import numpy as np
 # Load TTS model and vocoder automatically during initialization
 tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
 emotion_settings = {
     "neutral": {"pitch": 1.0, "speed": 1.0, "prosody": 0.5},  # Neutral tone
     "joy": {"pitch": 1.3, "speed": 1.2, "prosody": 1.5},      # Upbeat, energetic
     "shame": {"pitch": 0.8, "speed": 0.85, "prosody": 0.5},    # Quiet, subdued tone
 }
+import librosa
+import soundfile as sf
+def adjust_pitch(audio_path, pitch_factor):
+    """Shift the pitch of the audio file at ``audio_path`` in place.
+
+    Args:
+        audio_path: Path of the audio file to read and overwrite.
+        pitch_factor: Multiplicative pitch ratio. 1.0 leaves the pitch
+            unchanged, values below 1.0 lower it, values above 1.0 raise it.
+
+    Raises:
+        ValueError: If ``pitch_factor`` is not positive.
+    """
+    import math
+
+    if pitch_factor <= 0:
+        raise ValueError(f"pitch_factor must be positive, got {pitch_factor}")
+    # Load audio at its native sample rate so the overwrite does not resample it
+    y, sr = librosa.load(audio_path, sr=None)
     # Adjust pitch
+    # librosa expects the shift in semitones, so convert the ratio (12 semitones per octave)
+    n_steps = 12 * math.log2(pitch_factor)
+    # sr and n_steps must be passed by keyword in librosa >= 0.10
+    y_shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)
+    # Save adjusted audio
+    sf.write(audio_path, y_shifted, sr)
+def adjust_speed(audio_path, speed_factor):
+    """Time-stretch the audio file at ``audio_path`` in place.
+
+    Args:
+        audio_path: Path of the audio file to read and overwrite.
+        speed_factor: Stretch rate handed to librosa; values above 1.0
+            shorten the audio, values below 1.0 lengthen it.
+    """
+    # Load the audio file at its native sample rate so the overwrite does not resample it
+    y, sr = librosa.load(audio_path, sr=None)
+    # Adjust the speed (this alters the duration of the audio)
+    # rate must be passed by keyword in librosa >= 0.10
+    y_speeded = librosa.effects.time_stretch(y, rate=speed_factor)
     # Save the adjusted audio
+    sf.write(audio_path, y_speeded, sr)
 def generate_emotional_speech(text, emotion):
     # Retrieve pitch, speed, and prosody based on detected emotion
 # Example usage
 emotion_aware_tts_pipeline("I can’t stooop smiiiling, everything feels perrrfect!")
+"""Fine-tuning the Emotion Detection Model"""
 import os
 os.environ["WANDB_DISABLED"] = "true"
+from google.colab import drive
 from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
 from datasets import load_dataset
 # Define a function to map emotion labels to integers
 def map_emotion_to_int(example):
+    # Assuming your dataset has an 'emotion' column with string labels
+    # Replace this with your actual emotion labels and their corresponding integers
+    # *Change 'emotion' to the actual column name in your dataset*
     emotion_mapping = {
         "neutral": 0,
         "joy": 1,
         "disgust": 6,
         "shame": 7,
     }
+    # Assuming your emotion column is named 'label'
+    # example['label'] = emotion_mapping[example['emotion']] # Create a new 'label' column with integer values
+    example['label'] = emotion_mapping.get(example['label'], -1) # If the label is not in the emotion mapping then we set it to -1. We can later filter these examples out
     return example
 def preprocess_data(example):
+    return tokenizer(example['text'], truncation=True, padding=True, max_length=512) # Added max_length for consistency
 # Apply emotion mapping before tokenization
 dataset = dataset.map(map_emotion_to_int, batched=False)
+# *Keep the 'label' column for training. Only remove 'text'*
+# Filter out examples with labels not in emotion_mapping (-1)
+dataset = dataset.filter(lambda example: example['label'] != -1) # Filter out examples with label -1
 tokenized_dataset = dataset.map(preprocess_data, batched=True, remove_columns=['text'])
 # Load model
+# model = AutoModelForSequenceClassification.from_pretrained("bhadresh-savani/distilbert-base-uncased-emotion", num_labels=8)
+# Load model with ignore_mismatched_sizes=True
 model = AutoModelForSequenceClassification.from_pretrained(
     "bhadresh-savani/distilbert-base-uncased-emotion",
     num_labels=8,
 # Training arguments
 # Hyper-parameters for fine-tuning the emotion classifier with the Hugging Face Trainer.
 training_args = TrainingArguments(
+    output_dir="./results",    # Directory for model checkpoints and logs
+    # NOTE(review): newer transformers releases rename this argument to `eval_strategy`;
+    # the notebook installs with --upgrade, so confirm against the installed version.
+    evaluation_strategy="epoch",  # Evaluate after every epoch
+    learning_rate=5e-5,  # 5e-5; per the TrainingArguments docs this is the default value, not a raised one
+    per_device_train_batch_size=16,  # Use 16 for balance between memory usage and training speed
+    gradient_accumulation_steps=4,  # Accumulate 4 batches of 16 per optimizer step (64 samples per device)
+    num_train_epochs=5,  # Train for 5 epochs
+    weight_decay=0.01,  # Regularization to avoid overfitting
+    save_strategy="epoch",  # Save checkpoints after each epoch (same cadence as evaluation)
+    logging_dir="./logs",  # Directory for logging
+    logging_steps=100,  # Log every 100 steps
+    warmup_steps=500,  # Gradual learning rate increase for the first 500 steps
+    save_total_limit=3,  # Keep at most 3 checkpoints on disk
+    fp16=True,  # Mixed precision; NOTE(review): presumably requires a GPU runtime - confirm
+    load_best_model_at_end=True,  # Load the best model at the end of training
+    metric_for_best_model="eval_loss",  # Use evaluation loss to select the best model
+    greater_is_better=False,  # Lower loss is better
 )
 # Train model
 trainer = Trainer(
     model=model,
 trainer.train()
+# Save the model and tokenizer to Google Drive
 model_save_path = "/content/drive/My Drive/emotion_detection_model1"
 tokenizer_save_path = "/content/drive/My Drive/emotion_detection_model1"
+# Save the fine-tuned model
 model.save_pretrained(model_save_path)
 tokenizer.save_pretrained(tokenizer_save_path)
 print("Model and tokenizer saved to Google Drive.")
+"""Reload the Fine-Tuned Model"""
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
 # Mount Google Drive
 print("Fine-tuned model and tokenizer loaded successfully.")
+"""Test the Reloaded Model"""
 from transformers import pipeline
 # Create a text classification pipeline with the loaded model
 result = emotion_classifier(text)
 print(result)
+"""Fine-tuning the TTS System"""
+from TTS.api import TTS
+from TTS.utils.audio import AudioProcessor
+from TTS.tts.models.tacotron2 import Tacotron2
+import torch
+# Load pre-trained model
+#model = Tacotron2.load_model("tts_models/en/ljspeech/tacotron2-DDC")
+tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC") # Use TTS for model loading
+# Access the Tacotron2 model from the TTS object
+# NOTE: this rebinds the name `model`, which earlier in the file held the
+# emotion-classification model.
+model = tts.synthesizer.tts_model
+# Fine-tuning parameters
+# NOTE(review): this path uses "MyDrive" while the save paths in this file use
+# "My Drive" - confirm both resolve on the mounted drive.
+model.config.dataset_path = "/content/drive/MyDrive/RAVDESS"
+model.config.num_epochs = 10
+# Train
+# NOTE(review): assuming tts_model is a torch.nn.Module, .train() only switches
+# it to training mode; no optimisation loop runs here, so the state saved below
+# is presumably the unmodified pre-trained weights - confirm, and add a real
+# training loop if fine-tuning is intended.
+model.train()
+# Define the save path on Google Drive
+save_path = "/content/drive/My Drive/fine_tuned_tacotron2.pth"
+# Save the model's state dictionary using torch.save
+torch.save(model.state_dict(), save_path)
+"""Set up the Gradio interface"""
 import gradio as gr
+from transformers import pipeline
+from TTS.api import TTS
+# Load pre-trained emotion detection model
+emotion_classifier = pipeline("text-classification", model="bhadresh-savani/distilbert-base-uncased-emotion")
+# Load TTS model
+tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
+# Emotion-specific settings for pitch, speed, and prosody, keyed by emotion label
+# NOTE(review): verify these keys against the labels the classifier actually
+# emits (the pre-trained checkpoint is believed to include a "love" label that
+# has no entry here) - confirm how callers handle a missing key.
+emotion_settings = {
+    "neutral": {"pitch": 1.0, "speed": 1.0, "prosody": 0.5},  # Neutral tone
+    "joy": {"pitch": 1.3, "speed": 1.2, "prosody": 1.5},      # Upbeat, energetic
+    "sadness": {"pitch": 0.8, "speed": 0.9, "prosody": 0.8},   # Subdued, slow tone
+    "anger": {"pitch": 1.6, "speed": 1.4, "prosody": 1.8},     # Sharp, intense
+    "fear": {"pitch": 1.2, "speed": 0.95, "prosody": 1.2},     # Tense, slow
+    "surprise": {"pitch": 1.5, "speed": 1.3, "prosody": 1.4},  # Excited, high energy
+    "disgust": {"pitch": 0.9, "speed": 0.95, "prosody": 0.6},  # Low, deliberate
+    "shame": {"pitch": 0.8, "speed": 0.85, "prosody": 0.5},    # Quiet, subdued tone
+}
+# Function to process text or file input and generate audio
+def emotion_aware_tts_pipeline(input_text=None, file_input=None):
     try:
         # Get text from input or file
         if file_input:
             # Generate audio
             audio_path = "output.wav"
+            tts_model.tts_to_file(text=input_text, file_path=audio_path, speed=speed, pitch=pitch)
             return f"Detected Emotion: {emotion} (Confidence: {confidence:.2f})", audio_path
         else:
     except Exception as e:
         return f"Error: {str(e)}", None
 # Define Gradio interface
 iface = gr.Interface(
+    fn=emotion_aware_tts_pipeline,
     inputs=[
         gr.Textbox(label="Input Text", placeholder="Enter text here"),
         gr.File(label="Upload a Text File")