Spaces:

Anita-19
/

emotion-aware-tts

Runtime error

File size: 12,244 Bytes

from google.colab import drive
drive.mount('/content/drive')

"""Install Dependencies"""

pip install transformers librosa torch soundfile numba numpy TTS datasets gradio protobuf==3.20.3

"""Emotion Detection (Using Text Dataset)

"""

!pip install --upgrade numpy tensorflow transformers TTS

!pip freeze > requirements.txt

from transformers import pipeline

# Load pre-trained model for emotion detection.
# The resulting module-level `emotion_classifier` is the callable that
# `detect_emotion` invokes on each input text.
emotion_classifier = pipeline("text-classification", model="bhadresh-savani/distilbert-base-uncased-emotion")

def detect_emotion(text):
    """Classify *text* and return its top emotion label with the score.

    Calls the module-level ``emotion_classifier`` on ``text`` and reads the
    first entry of the result.

    Returns:
        A ``(label, score)`` tuple taken from that first entry.
    """
    top_prediction = emotion_classifier(text)[0]
    return top_prediction['label'], top_prediction['score']

# Example usage: classify one sample sentence and print the label and score.
text = "I am feeling excited today!"
emotion, confidence = detect_emotion(text)
print(f"Detected Emotion: {emotion}, Confidence: {confidence}")

"""Emotion-Aware TTS (Using Tacotron 2 or Similar)"""

import torch
import librosa
import numpy as np
from TTS.api import TTS  # Using Coqui TTS for simplicity

# Load the Tacotron2-DDC (LJSpeech) model by name. The module-level
# `tts_model` is the object whose `tts_to_file` the speech-generation code
# in this file calls.
tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")


def generate_emotional_speech(text, emotion):
    """Synthesize *text* to ``output.wav`` with emotion-dependent prosody.

    The text is synthesized unchanged with the module-level ``tts_model``.
    The waveform is then time-stretched and pitch-shifted with librosa
    according to the settings for *emotion*, so the spoken words are never
    altered to fake a speed change.

    Args:
        text: Text to speak.
        emotion: Emotion label, matched case-insensitively against the
            settings table. Labels not in the table use neutral settings.

    Returns:
        Path of the written WAV file (``"output.wav"``).
    """
    # Function-scope imports: neither name is imported at module level
    # before this function is defined.
    import math
    import soundfile as sf

    # Map emotion to voice modulation parameters. Both values are
    # multipliers relative to the unmodified voice (1.0 = unchanged).
    emotion_settings = {
        "neutral": {"pitch": 1.0, "speed": 1.0},     # Baseline conversational tone
        "joy": {"pitch": 1.3, "speed": 1.2},         # Upbeat and energetic
        "sadness": {"pitch": 0.8, "speed": 0.9},     # Subdued, slow tone
        "anger": {"pitch": 1.6, "speed": 1.4},       # Intense and sharp
        "fear": {"pitch": 1.2, "speed": 0.95},       # Tense and slightly slow
        "surprise": {"pitch": 1.5, "speed": 1.3},    # Excitement with high pitch and fast speech
        "disgust": {"pitch": 0.9, "speed": 0.95},    # Low and deliberate
        "shame": {"pitch": 0.8, "speed": 0.85},      # Quiet, subdued tone
    }

    # Case-insensitive lookup; unknown labels fall back to neutral.
    settings = emotion_settings.get(
        str(emotion).lower(), {"pitch": 1.0, "speed": 1.0}
    )
    speed = settings["speed"]
    pitch = settings["pitch"]

    # Synthesize the original text as-is.
    audio_path = "output.wav"
    tts_model.tts_to_file(text=text, file_path=audio_path)

    # Nothing to modulate: leave the synthesized file untouched.
    if speed == 1.0 and pitch == 1.0:
        return audio_path

    # sr=None keeps the file's own sample rate instead of resampling.
    samples, sample_rate = librosa.load(audio_path, sr=None)
    if speed != 1.0:
        samples = librosa.effects.time_stretch(samples, rate=speed)
    if pitch != 1.0:
        # librosa shifts in semitones; convert the frequency ratio
        # (12 semitones per doubling of frequency).
        samples = librosa.effects.pitch_shift(
            samples, sr=sample_rate, n_steps=12.0 * math.log2(pitch)
        )
    sf.write(audio_path, samples, sample_rate)
    return audio_path

# Example usage
# "joy" is a key of the emotion settings table used by
# generate_emotional_speech; a label that is not in the table (such as
# "happy") silently falls back to the neutral settings.
emotion = "joy"
output_audio = generate_emotional_speech("Welcome to the smart library!", emotion)
print(f"Generated Speech Saved At: {output_audio}")

"""Integrating the Workflow"""

from IPython.display import Audio, display

def emotion_aware_tts_pipeline(text):
    """Detect the emotion of *text*, synthesize it, and play the result inline.

    Prints the detected label with its confidence, then the path of the
    generated audio file, and finally displays an autoplaying IPython audio
    widget for that file. Returns nothing.
    """
    label, score = detect_emotion(text)
    print(f"Emotion Detected: {label} with Confidence: {score:.2f}")

    wav_path = generate_emotional_speech(text, label)
    print(f"Audio Generated: {wav_path}")

    # Render the audio player in the notebook output and start playback.
    player = Audio(wav_path, autoplay=True)
    display(player)

# Example usage: run the full detect -> synthesize -> play pipeline once.
emotion_aware_tts_pipeline("I can’t stooop smiiiling, everything feels perrrfect!")

"""Fine-tuning the Emotion Detection Model"""

import os
# NOTE(review): presumably disables Weights & Biases logging during
# training -- confirm against the installed transformers version.
os.environ["WANDB_DISABLED"] = "true"

from google.colab import drive
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset

# Load the dataset from a Google Drive directory (requires the drive to be
# mounted). The code below expects 'text' and 'label' columns and
# 'train' / 'validation' splits.
dataset = load_dataset('/content/drive/MyDrive/Emotion_Model')  #path_to_your_dataset

# Tokenizer matching the pre-trained checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("bhadresh-savani/distilbert-base-uncased-emotion")

# Convert an example's string emotion label into its integer class id.
def map_emotion_to_int(example):
    """Overwrite ``example['label']`` with its integer class id, in place.

    Labels that are not in the table become ``-1`` so that a later filter
    step can drop those examples.

    Args:
        example: Mapping with a ``'label'`` entry holding an emotion name.

    Returns:
        The same ``example`` object, with ``'label'`` replaced.
    """
    # Position in this tuple is the class id (0..7).
    # NOTE(review): the table uses "sad" while the emotion settings elsewhere
    # in this file use "sadness" -- confirm which spelling the dataset uses.
    label_names = (
        "neutral",
        "joy",
        "sad",
        "anger",
        "fear",
        "surprise",
        "disgust",
        "shame",
    )
    label_ids = {name: class_id for class_id, name in enumerate(label_names)}

    example['label'] = label_ids.get(example['label'], -1)
    return example


def preprocess_data(example):
    """Tokenize the ``'text'`` field of *example* with the module-level tokenizer.

    Truncation and padding are both enabled and the maximum length is 512.
    Returns whatever the tokenizer call returns.
    """
    tokenizer_options = {"truncation": True, "padding": True, "max_length": 512}
    return tokenizer(example['text'], **tokenizer_options)


# Apply emotion mapping before tokenization (one example at a time).
dataset = dataset.map(map_emotion_to_int, batched=False)
# Drop examples whose label was not in the mapping (marked with -1).
dataset = dataset.filter(lambda example: example['label'] != -1) # Filter out examples with label -1
# Tokenize in batches. Only the raw 'text' column is removed; 'label' is kept
# for training.
tokenized_dataset = dataset.map(preprocess_data, batched=True, remove_columns=['text'])

# Load the pre-trained checkpoint with an 8-class head.
# ignore_mismatched_sizes=True lets loading proceed when the checkpoint's
# classification head does not have 8 outputs.
# NOTE(review): no id2label/label2id is passed, so predictions from the
# fine-tuned model are presumably reported with generic label names rather
# than emotion names -- confirm.
model = AutoModelForSequenceClassification.from_pretrained(
    "bhadresh-savani/distilbert-base-uncased-emotion",
    num_labels=8,
    ignore_mismatched_sizes=True
)

# Training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm the spelling against the installed version.
training_args = TrainingArguments(
    output_dir="./results",    # Directory for model checkpoints and logs
    evaluation_strategy="epoch",  # Evaluate after every epoch
    learning_rate=5e-5,  # Start with 5e-5 (slightly higher than default 2e-5)
    per_device_train_batch_size=16,  # Use 16 for balance between memory usage and training speed
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch size
    num_train_epochs=5,  # Train for 4-5 epochs (typically enough for fine-tuning)
    weight_decay=0.01,  # Regularization to avoid overfitting
    save_strategy="epoch",  # Save checkpoints after each epoch
    logging_dir="./logs",  # Directory for logging
    logging_steps=100,  # Log every 100 steps
    warmup_steps=500,  # Gradual learning rate increase for the first 500 steps
    save_total_limit=3,  # Keep only the last 3 checkpoints
    fp16=True,  # Enable mixed precision for faster training if GPU supports it
    load_best_model_at_end=True,  # Load the best model at the end of training
    metric_for_best_model="eval_loss",  # Use evaluation loss to select the best model
    greater_is_better=False,  # Lower loss is better
)


# Build the trainer on the 'train' and 'validation' splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
)

# Run fine-tuning.
trainer.train()

# Save the model and tokenizer to Google Drive. Both go to the same
# directory: ".../emotion_detection_model1".
model_save_path = "/content/drive/My Drive/emotion_detection_model1"
tokenizer_save_path = "/content/drive/My Drive/emotion_detection_model1"

# Save the fine-tuned model
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

print("Model and tokenizer saved to Google Drive.")

"""Reload the Fine-Tuned Model"""

from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Path to the saved model and tokenizer.
# This must be the directory the fine-tuning step writes to
# (".../emotion_detection_model1"); loading from a different directory would
# either fail or silently pick up a stale model from an earlier run.
model_save_path = "/content/drive/My Drive/emotion_detection_model1"
tokenizer_save_path = "/content/drive/My Drive/emotion_detection_model1"

# Load the fine-tuned model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(model_save_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)

print("Fine-tuned model and tokenizer loaded successfully.")

"""Test the Reloaded Model"""

from transformers import pipeline

# Create a text classification pipeline with the loaded model.
# This rebinds the module-level `emotion_classifier` to the fine-tuned model.
emotion_classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Test with a sample text and print the raw pipeline output.
text = "I feel so upset today!"
result = emotion_classifier(text)
print(result)

"""Fine-tuning the TTS System"""

from TTS.api import TTS
from TTS.utils.audio import AudioProcessor
from TTS.tts.models.tacotron2 import Tacotron2
import torch

# Load the pre-trained Tacotron2-DDC model through the high-level TTS wrapper.
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC") # Use TTS for model loading

# Pull the underlying model object out of the wrapper.
# NOTE: this rebinds the module-level name `model`, which earlier parts of
# this file bind to the emotion classification model.
model = tts.synthesizer.tts_model

# Fine-tuning parameters
# NOTE(review): these are plain attribute assignments on the config object;
# nothing visible in this file reads them back -- confirm they have any effect.
model.config.dataset_path = "/content/drive/MyDrive/RAVDESS"
model.config.num_epochs = 10

# NOTE(review): presumably this is torch.nn.Module.train(), which only
# switches the module into training mode and performs no optimization --
# confirm. If so, no fine-tuning happens here and the weights saved below
# are the pre-trained ones.
model.train()

# Define the save path on Google Drive
save_path = "/content/drive/My Drive/fine_tuned_tacotron2.pth"

# Save the model's state dictionary using torch.save
torch.save(model.state_dict(), save_path)
    

"""Set up the Gradio interface"""

import gradio as gr
from transformers import pipeline
from TTS.api import TTS

# Load pre-trained emotion detection model.
# NOTE: this loads the published checkpoint by name, not the fine-tuned copy
# that is saved to Google Drive elsewhere in this file, and it rebinds the
# module-level `emotion_classifier`.
emotion_classifier = pipeline("text-classification", model="bhadresh-savani/distilbert-base-uncased-emotion")

# Load TTS model (rebinds the module-level `tts_model`).
tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")

# Emotion-specific settings for pitch and speed, keyed by lowercase emotion
# label. The Gradio handler treats a value of 1.0 as "no adjustment" and
# skips the corresponding audio post-processing step.
emotion_settings = {
    "neutral": {"pitch": 1.0, "speed": 1.0},
    "joy": {"pitch": 1.3, "speed": 1.2},
    "sadness": {"pitch": 0.8, "speed": 0.9},
    "anger": {"pitch": 1.6, "speed": 1.4},
    "fear": {"pitch": 1.2, "speed": 0.95},
    "surprise": {"pitch": 1.5, "speed": 1.3},
    "disgust": {"pitch": 0.9, "speed": 0.95},
    "shame": {"pitch": 0.8, "speed": 0.85},
}

import librosa
import soundfile as sf

def adjust_audio_speed(audio_path, speed_factor):
    """Time-stretch the audio file at *audio_path* and overwrite it in place.

    Args:
        audio_path: Path of the audio file to read and then overwrite.
        speed_factor: Rate multiplier handed to librosa's time stretch;
            values above 1.0 speed the audio up, values below 1.0 slow it down.
    """
    # sr=None keeps the file's own sample rate instead of resampling to
    # librosa's default, so the rewritten file matches the original rate.
    y, sr = librosa.load(audio_path, sr=None)
    # `rate` is passed by keyword: recent librosa releases make it
    # keyword-only, so a positional argument raises TypeError.
    y_speeded = librosa.effects.time_stretch(y, rate=speed_factor)
    sf.write(audio_path, y_speeded, sr)

def adjust_audio_pitch(audio_path, pitch_factor):
    """Pitch-shift the audio file at *audio_path* and overwrite it in place.

    Args:
        audio_path: Path of the audio file to read and then overwrite.
        pitch_factor: Frequency ratio relative to the original pitch
            (1.0 = unchanged, above 1.0 = higher, below 1.0 = lower).
            Must be positive.

    Raises:
        ValueError: If ``pitch_factor`` is not greater than zero.
    """
    import math  # function-scope import: `math` is not imported at module level

    if pitch_factor <= 0:
        raise ValueError(f"pitch_factor must be positive, got {pitch_factor!r}")

    # sr=None keeps the file's own sample rate instead of resampling.
    y, sr = librosa.load(audio_path, sr=None)
    # librosa expects the shift in semitones, not as a ratio. Passing the
    # ratio directly would turn 0.8 ("lower") into a +0.8 semitone shift.
    # There are 12 semitones per doubling of frequency.
    n_steps = 12.0 * math.log2(pitch_factor)
    # `sr` and `n_steps` are keyword-only in recent librosa releases.
    y_shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)
    sf.write(audio_path, y_shifted, sr)

def emotion_aware_tts_pipeline(input_text=None, file_input=None):
    """Gradio handler: detect the emotion of the text and synthesize speech.

    The text comes from *file_input* when a file is supplied, otherwise from
    *input_text*. The speech is written to ``output.wav`` and then pitch- and
    speed-adjusted according to ``emotion_settings`` for the detected label.

    Args:
        input_text: Text typed into the textbox, or ``None``.
        file_input: Uploaded text file, or ``None``. May be either a path
            string or an object exposing the path as ``.name``.

    Returns:
        A ``(message, audio_path)`` tuple. ``audio_path`` is ``None`` when no
        usable text was provided or when an error occurred; ``message`` then
        describes the problem.
    """
    try:
        # Get text from input or file. Accept both a plain path and a
        # file-like object with `.name`, since the upload component's value
        # type depends on the Gradio version/configuration.
        if file_input:
            file_path = getattr(file_input, "name", file_input)
            with open(file_path, 'r', encoding='utf-8') as file:
                input_text = file.read()

        # Treat missing and whitespace-only text the same way.
        if not input_text or not input_text.strip():
            return "Please provide input text or file", None

        # Detect emotion. truncation=True keeps long inputs (e.g. uploaded
        # files) within the model's maximum sequence length; the emotion is
        # then judged on the leading part of the text.
        emotion_data = emotion_classifier(input_text, truncation=True)[0]
        emotion = emotion_data['label']
        confidence = emotion_data['score']

        # Look up pitch and speed; unknown labels get neutral settings.
        settings = emotion_settings.get(emotion.lower(), {"pitch": 1.0, "speed": 1.0})
        pitch = settings["pitch"]
        speed = settings["speed"]

        # Generate audio
        audio_path = "output.wav"
        tts_model.tts_to_file(text=input_text, file_path=audio_path)

        # Adjust pitch and speed using librosa; 1.0 means "leave unchanged".
        if pitch != 1.0:
            adjust_audio_pitch(audio_path, pitch)
        if speed != 1.0:
            adjust_audio_speed(audio_path, speed)

        return f"Detected Emotion: {emotion} (Confidence: {confidence:.2f})", audio_path
    except Exception as e:
        # UI boundary: report the failure in the textbox instead of crashing
        # the Gradio request.
        return f"Error: {e}", None

  

# Define Gradio interface.
# The two inputs map positionally onto the handler's `input_text` and
# `file_input` parameters; the two outputs receive the (message, audio path)
# pair the handler returns.
iface = gr.Interface(
    fn=emotion_aware_tts_pipeline,
    inputs=[
        gr.Textbox(label="Input Text", placeholder="Enter text here"),
        gr.File(label="Upload a Text File")
    ],
    outputs=[
        gr.Textbox(label="Detected Emotion"),
        gr.Audio(label="Generated Audio")
    ],
    title="Emotion-Aware Text-to-Speech",
    description="Input text or upload a text file to detect the emotion and generate audio with emotion-aware modulation."
)

# Launch Gradio interface (runs at import time; there is no __main__ guard).
iface.launch()