Spaces:

Anita-19
/

emotion-aware-tts

Running

App Files Files Community

Anita-19 committed on Jan 23

Commit

afe9f6d

verified ·

1 Parent(s): 4635cfc

Update main.py

Browse files

Files changed (1) hide show

main.py +75 -162

main.py CHANGED Viewed

@@ -1,18 +1,12 @@
 from google.colab import drive
 drive.mount('/content/drive')
-"""Install Dependencies"""
-pip install transformers librosa torch soundfile numba numpy TTS datasets gradio protobuf==3.20.3
-"""Emotion Detection (Using Text Dataset)
-"""
 !pip install --upgrade numpy tensorflow transformers TTS
-!pip freeze > requirements.txt
 from transformers import pipeline
 # Load pre-trained model for emotion detection
@@ -29,8 +23,7 @@ text = "I am feeling excited today!"
 emotion, confidence = detect_emotion(text)
 print(f"Detected Emotion: {emotion}, Confidence: {confidence}")
-"""Emotion-Aware TTS (Using Tacotron 2 or Similar)"""
 import torch
 import librosa
 import numpy as np
@@ -39,67 +32,58 @@ from TTS.api import TTS  # Using Coqui TTS for simplicity
 # Load TTS model and vocoder automatically during initialization
 tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
-def generate_emotional_speech(text, emotion):
-    # Map emotion to voice modulation parameters (pitch, speed)
-    emotion_settings = {
-    "neutral": {"pitch": 1.0, "speed": 1.0},     # Baseline conversational tone
-    "joy": {"pitch": 1.3, "speed": 1.2},         # Upbeat and energetic
-    "sadness": {"pitch": 0.8, "speed": 0.9},     # Subdued, slow tone
-    "anger": {"pitch": 1.6, "speed": 1.4},       # Intense and sharp
-    "fear": {"pitch": 1.2, "speed": 0.95},       # Tense and slightly slow
-    "surprise": {"pitch": 1.5, "speed": 1.3},    # Excitement with high pitch and fast speech
-    "disgust": {"pitch": 0.9, "speed": 0.95},    # Low and deliberate
-    "shame": {"pitch": 0.8, "speed": 0.85},      # Quiet, subdued tone
 }
-import librosa
-import soundfile as sf
-def adjust_pitch(audio_path, pitch_factor):
-    # Load audio
     y, sr = librosa.load(audio_path)
     # Adjust pitch
-    y_shifted = librosa.effects.pitch_shift(y, sr, n_steps=pitch_factor)
-    # Save adjusted audio
-    sf.write(audio_path, y_shifted, sr)
-def adjust_speed(audio_path, speed_factor):
-    # Load the audio file
-    y, sr = librosa.load(audio_path)
-    # Adjust the speed (this alters the duration of the audio)
-    y_speeded = librosa.effects.time_stretch(y, speed_factor)
-    # Save the adjusted audio
-    sf.write(audio_path, y_speeded, sr)
-    # Retrieve pitch and speed based on detected emotion
-    settings = emotion_settings.get(emotion, {"pitch": 1.0, "speed": 1.0})
-    # Generate speech with the TTS model
-    # Instead of directly passing speed and pitch to tts_to_file,
-    # We adjust the text to simulate the effect. This is a temporary solution.
-    # You might need to fine-tune these adjustments or consider a different TTS library
-    # with better control over speech parameters.
-    adjusted_text = text
-    if settings['speed'] > 1.0:
-        adjusted_text = adjusted_text.replace(" ", ".")  # Simulate faster speech
-    elif settings['speed'] < 1.0:
-        adjusted_text = adjusted_text.replace(" ", "...")  # Simulate slower speech
-    # Explicitly specify the output path
-    audio_path = "output.wav"  # Or any desired filename
-    tts_model.tts_to_file(text=adjusted_text, file_path=audio_path)  # Pass file_path argument
-    return audio_path
-# Example usage
-emotion = "happy"
-output_audio = generate_emotional_speech("Welcome to the smart library!", emotion)
-print(f"Generated Speech Saved At: {output_audio}")
-"""Integrating the Workflow"""
 from IPython.display import Audio, display
 def emotion_aware_tts_pipeline(text):
@@ -115,12 +99,9 @@ def emotion_aware_tts_pipeline(text):
 # Example usage
 emotion_aware_tts_pipeline("I can’t stooop smiiiling, everything feels perrrfect!")
-"""Fine-tuning the Emotion Detection Model"""
 import os
 os.environ["WANDB_DISABLED"] = "true"
-from google.colab import drive
 from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
 from datasets import load_dataset
@@ -132,9 +113,6 @@ tokenizer = AutoTokenizer.from_pretrained("bhadresh-savani/distilbert-base-uncas
 # Define a function to map emotion labels to integers
 def map_emotion_to_int(example):
-    # Assuming your dataset has an 'emotion' column with string labels
-    # Replace this with your actual emotion labels and their corresponding integers
-    # **Change 'emotion' to the actual column name in your dataset**
     emotion_mapping = {
         "neutral": 0,
         "joy": 1,
@@ -145,27 +123,18 @@ def map_emotion_to_int(example):
         "disgust": 6,
         "shame": 7,
     }
-    # Assuming your emotion column is named 'label'
-    # example['label'] = emotion_mapping[example['emotion']] # Create a new 'label' column with integer values
-    example['label'] = emotion_mapping.get(example['label'], -1) # If the label is not in the emotion mapping then we set it to -1. We can later filter these examples out
     return example
 def preprocess_data(example):
-    return tokenizer(example['text'], truncation=True, padding=True, max_length=512) # Added max_length for consistency
 # Apply emotion mapping before tokenization
 dataset = dataset.map(map_emotion_to_int, batched=False)
-# **Keep the 'label' column for training. Only remove 'text'**
-# Filter out examples with labels not in emotion_mapping (-1)
-dataset = dataset.filter(lambda example: example['label'] != -1) # Filter out examples with label -1
 tokenized_dataset = dataset.map(preprocess_data, batched=True, remove_columns=['text'])
 # Load model
-# model = AutoModelForSequenceClassification.from_pretrained("bhadresh-savani/distilbert-base-uncased-emotion", num_labels=8)
-# Load model with ignore_mismatched_sizes=True
 model = AutoModelForSequenceClassification.from_pretrained(
     "bhadresh-savani/distilbert-base-uncased-emotion",
     num_labels=8,
@@ -174,25 +143,24 @@ model = AutoModelForSequenceClassification.from_pretrained(
 # Training arguments
 training_args = TrainingArguments(
-    output_dir="./results",    # Directory for model checkpoints and logs
-    evaluation_strategy="epoch",  # Evaluate after every epoch
-    learning_rate=5e-5,  # Start with 5e-5 (slightly higher than default 2e-5)
-    per_device_train_batch_size=16,  # Use 16 for balance between memory usage and training speed
-    gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch size
-    num_train_epochs=5,  # Train for 4-5 epochs (typically enough for fine-tuning)
-    weight_decay=0.01,  # Regularization to avoid overfitting
-    save_strategy="epoch",  # Save checkpoints after each epoch
-    logging_dir="./logs",  # Directory for logging
-    logging_steps=100,  # Log every 100 steps
-    warmup_steps=500,  # Gradual learning rate increase for the first 500 steps
-    save_total_limit=3,  # Keep only the last 3 checkpoints
-    fp16=True,  # Enable mixed precision for faster training if GPU supports it
-    load_best_model_at_end=True,  # Load the best model at the end of training
-    metric_for_best_model="eval_loss",  # Use evaluation loss to select the best model
-    greater_is_better=False,  # Lower loss is better
 )
 # Train model
 trainer = Trainer(
     model=model,
@@ -204,18 +172,15 @@ trainer = Trainer(
 trainer.train()
-# Save the model and tokenizer to Google Drive
 model_save_path = "/content/drive/My Drive/emotion_detection_model1"
 tokenizer_save_path = "/content/drive/My Drive/emotion_detection_model1"
-# Save the fine-tuned model
 model.save_pretrained(model_save_path)
 tokenizer.save_pretrained(tokenizer_save_path)
 print("Model and tokenizer saved to Google Drive.")
-"""Reload the Fine-Tuned Model"""
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
 # Mount Google Drive
@@ -232,8 +197,7 @@ tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)
 print("Fine-tuned model and tokenizer loaded successfully.")
-"""Test the Reloaded Model"""
 from transformers import pipeline
 # Create a text classification pipeline with the loaded model
@@ -244,61 +208,10 @@ text = "I feel so upset today!"
 result = emotion_classifier(text)
 print(result)
-"""Fine-tuning the TTS System"""
-from TTS.api import TTS
-from TTS.utils.audio import AudioProcessor
-from TTS.tts.models.tacotron2 import Tacotron2
-import torch
-# Load pre-trained model
-#model = Tacotron2.load_model("tts_models/en/ljspeech/tacotron2-DDC")
-tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC") # Use TTS for model loading
-# Access the Tacotron2 model from the TTS object
-model = tts.synthesizer.tts_model
-# Fine-tuning parameters
-model.config.dataset_path = "/content/drive/MyDrive/RAVDESS"
-model.config.num_epochs = 10
-# Train
-model.train()
-# Define the save path on Google Drive
-save_path = "/content/drive/My Drive/fine_tuned_tacotron2.pth"
-# Save the model's state dictionary using torch.save
-torch.save(model.state_dict(), save_path)
-"""Set up the Gradio interface"""
 import gradio as gr
-from transformers import pipeline
-from TTS.api import TTS
-# Load pre-trained emotion detection model
-emotion_classifier = pipeline("text-classification", model="bhadresh-savani/distilbert-base-uncased-emotion")
-# Load TTS model
-tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
-# Emotion-specific settings for pitch and speed
-emotion_settings = {
-    "neutral": {"pitch": 1.0, "speed": 1.0},
-    "joy": {"pitch": 1.3, "speed": 1.2},
-    "sadness": {"pitch": 0.8, "speed": 0.9},
-    "anger": {"pitch": 1.6, "speed": 1.4},
-    "fear": {"pitch": 1.2, "speed": 0.95},
-    "surprise": {"pitch": 1.5, "speed": 1.3},
-    "disgust": {"pitch": 0.9, "speed": 0.95},
-    "shame": {"pitch": 0.8, "speed": 0.85},
-}
-# Function to process text or file input and generate audio
-def emotion_aware_tts_pipeline(input_text=None, file_input=None):
     try:
         # Get text from input or file
         if file_input:
@@ -318,9 +231,11 @@ def emotion_aware_tts_pipeline(input_text=None, file_input=None):
             # Generate audio
             audio_path = "output.wav"
-            tts_model.tts_to_file(text=input_text, file_path=audio_path, speed=speed, pitch=pitch)
             return f"Detected Emotion: {emotion} (Confidence: {confidence:.2f})", audio_path
         else:
@@ -328,11 +243,9 @@ def emotion_aware_tts_pipeline(input_text=None, file_input=None):
     except Exception as e:
         return f"Error: {str(e)}", None
 # Define Gradio interface
 iface = gr.Interface(
-    fn=emotion_aware_tts_pipeline,
     inputs=[
         gr.Textbox(label="Input Text", placeholder="Enter text here"),
         gr.File(label="Upload a Text File")
@@ -346,4 +259,4 @@ iface = gr.Interface(
 )
 # Launch Gradio interface
-iface.launch()

 from google.colab import drive
 drive.mount('/content/drive')
+# Install Dependencies
+!pip install transformers librosa torch soundfile numba numpy TTS datasets gradio protobuf==3.20.3
+# Emotion Detection (Using Text Dataset)
 !pip install --upgrade numpy tensorflow transformers TTS
 from transformers import pipeline
 # Load pre-trained model for emotion detection
 emotion, confidence = detect_emotion(text)
 print(f"Detected Emotion: {emotion}, Confidence: {confidence}")
+# Emotion-Aware TTS (Using Tacotron 2 or Similar)
 import torch
 import librosa
 import numpy as np
 # Load TTS model and vocoder automatically during initialization
 tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
+# HiFi-GAN Vocoder (Ensure you have the model or download it)
+# NOTE(review): confirm that `TTS.utils.generic_utils.download_model` and
+# `TTS.vocoder.hifigan.HIFIGAN` exist in the installed TTS release — TODO confirm.
+# These are module-level imports, so if either name is missing the script
+# stops here with an ImportError.
+from TTS.utils.generic_utils import download_model
+from TTS.vocoder.hifigan import HIFIGAN
+vocoder_model = download_model("hifigan_ljspeech")
+vocoder = HIFIGAN(vocoder_model)
+# Emotion-specific settings for pitch, speed, and prosody
+# Keys are emotion labels; generate_emotional_speech() looks an entry up by
+# label and reads its "pitch" and "speed" values.
+# NOTE(review): "prosody" is not read by generate_emotional_speech() — confirm
+# whether anything else consumes it.
+emotion_settings = {
+    "neutral": {"pitch": 1.0, "speed": 1.0, "prosody": 0.5},  # Neutral tone
+    "joy": {"pitch": 1.3, "speed": 1.2, "prosody": 1.5},      # Upbeat, energetic
+    "sadness": {"pitch": 0.8, "speed": 0.9, "prosody": 0.8},   # Subdued, slow tone
+    "anger": {"pitch": 1.6, "speed": 1.4, "prosody": 1.8},     # Sharp, intense
+    "fear": {"pitch": 1.2, "speed": 0.95, "prosody": 1.2},     # Tense, slow
+    "surprise": {"pitch": 1.5, "speed": 1.3, "prosody": 1.4},  # Excited, high energy
+    "disgust": {"pitch": 0.9, "speed": 0.95, "prosody": 0.6},  # Low, deliberate
+    "shame": {"pitch": 0.8, "speed": 0.85, "prosody": 0.5},    # Quiet, subdued tone
 }
+def adjust_pitch_and_speed(audio_path, pitch_factor, speed_factor):
+    # Load audio file
     y, sr = librosa.load(audio_path)
     # Adjust pitch
+    y_pitch = librosa.effects.pitch_shift(y, sr, n_steps=pitch_factor)
+    # Adjust speed
+    y_speed = librosa.effects.time_stretch(y_pitch, speed_factor)
+    # Save the adjusted audio
+    sf.write(audio_path, y_speed, sr)
+def generate_emotional_speech(text, emotion):
+    # Retrieve pitch, speed, and prosody based on detected emotion
+    settings = emotion_settings.get(emotion, {"pitch": 1.0, "speed": 1.0, "prosody": 1.0})
+    pitch = settings["pitch"]
+    speed = settings["speed"]
+    # Generate mel spectrogram with TTS
+    mel_spectrogram = tts_model.get_mel_spectrogram(text)
+    # Use HiFi-GAN vocoder to decode the spectrogram into waveform
+    audio = vocoder.decode(mel_spectrogram)
+    audio_path = "output.wav"
+    librosa.output.write_wav(audio_path, audio, sr=22050)  # Save the initial audio
+    # Apply post-processing: adjust pitch and speed
+    adjust_pitch_and_speed(audio_path, pitch_factor=pitch, speed_factor=speed)
+    return audio_path
+# Integrating Emotion Detection and TTS Pipeline
 from IPython.display import Audio, display
 def emotion_aware_tts_pipeline(text):
 # Example usage
 emotion_aware_tts_pipeline("I can’t stooop smiiiling, everything feels perrrfect!")
+# Fine-tuning the Emotion Detection Model (if needed)
 import os
 os.environ["WANDB_DISABLED"] = "true"
 from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
 from datasets import load_dataset
 # Define a function to map emotion labels to integers
 def map_emotion_to_int(example):
     emotion_mapping = {
         "neutral": 0,
         "joy": 1,
         "disgust": 6,
         "shame": 7,
     }
+    example['label'] = emotion_mapping.get(example['label'], -1)
     return example
 def preprocess_data(example):
+    """Tokenize the 'text' field of `example` with the module-level `tokenizer`.
+
+    Tokenization is requested with truncation enabled, `padding=True` and
+    `max_length=512`; the tokenizer's output is returned unchanged.
+    """
+    return tokenizer(example['text'], truncation=True, padding=True, max_length=512)
 # Apply emotion mapping before tokenization
 dataset = dataset.map(map_emotion_to_int, batched=False)
+dataset = dataset.filter(lambda example: example['label'] != -1)
 tokenized_dataset = dataset.map(preprocess_data, batched=True, remove_columns=['text'])
 # Load model
 model = AutoModelForSequenceClassification.from_pretrained(
     "bhadresh-savani/distilbert-base-uncased-emotion",
     num_labels=8,
 # Training arguments
 training_args = TrainingArguments(
+    output_dir="./results",
+    evaluation_strategy="epoch",
+    learning_rate=5e-5,
+    per_device_train_batch_size=16,
+    gradient_accumulation_steps=4,
+    num_train_epochs=5,
+    weight_decay=0.01,
+    save_strategy="epoch",
+    logging_dir="./logs",
+    logging_steps=100,
+    warmup_steps=500,
+    save_total_limit=3,
+    fp16=True,
+    load_best_model_at_end=True,
+    metric_for_best_model="eval_loss",
+    greater_is_better=False,
 )
 # Train model
 trainer = Trainer(
     model=model,
 trainer.train()
+# Save the model and tokenizer
 model_save_path = "/content/drive/My Drive/emotion_detection_model1"
 tokenizer_save_path = "/content/drive/My Drive/emotion_detection_model1"
 model.save_pretrained(model_save_path)
 tokenizer.save_pretrained(tokenizer_save_path)
 print("Model and tokenizer saved to Google Drive.")
+# Reload the Fine-Tuned Model
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
 # Mount Google Drive
 print("Fine-tuned model and tokenizer loaded successfully.")
+# Test the Reloaded Model
 from transformers import pipeline
 # Create a text classification pipeline with the loaded model
 result = emotion_classifier(text)
 print(result)
+# Set up the Gradio interface
 import gradio as gr
+def emotion_aware_tts_pipeline_gradio(input_text=None, file_input=None):
     try:
         # Get text from input or file
         if file_input:
             # Generate audio
             audio_path = "output.wav"
+            mel_spectrogram = tts_model.get_mel_spectrogram(input_text)
+            audio = vocoder.decode(mel_spectrogram)
+            # Post-processing: adjust pitch and speed
+            adjust_pitch_and_speed(audio_path, pitch_factor=pitch, speed_factor=speed)
             return f"Detected Emotion: {emotion} (Confidence: {confidence:.2f})", audio_path
         else:
     except Exception as e:
         return f"Error: {str(e)}", None
 # Define Gradio interface
 iface = gr.Interface(
+    fn=emotion_aware_tts_pipeline_gradio,
     inputs=[
         gr.Textbox(label="Input Text", placeholder="Enter text here"),
         gr.File(label="Upload a Text File")
 )
 # Launch Gradio interface
+iface.launch()