Spaces:

Anita-19
/

emotion-aware-tts

Runtime error

App Files Files Community

Anita-19 committed on Jan 13

Commit

b10cf5d

verified ·

1 Parent(s): b2af844

app.py

Browse files

Files changed (1) hide show

app.py +323 -0

app.py ADDED Viewed

	@@ -0,0 +1,323 @@

+from google.colab import drive
+drive.mount('/content/drive')
+"""Install Dependencies"""
+pip install transformers librosa torch soundfile numba numpy TTS datasets gradio protobuf==3.20.3
+"""Emotion Detection (Using Text Dataset)
+"""
+!pip install --upgrade numpy tensorflow transformers TTS
+!pip freeze > requirements.txt
+from transformers import pipeline
+# Load pre-trained model for emotion detection
+emotion_classifier = pipeline("text-classification", model="bhadresh-savani/distilbert-base-uncased-emotion")
+def detect_emotion(text):
+    result = emotion_classifier(text)
+    emotion = result[0]['label']
+    confidence = result[0]['score']
+    return emotion, confidence
+# Example usage. These statements are at module level, so they run every time
+# the file is imported or executed, not only on demand.
+text = "I am feeling excited today!"
+emotion, confidence = detect_emotion(text)
+print(f"Detected Emotion: {emotion}, Confidence: {confidence}")
+"""Emotion-Aware TTS (Using Tacotron 2 or Similar)"""
+import torch
+import librosa
+import numpy as np
+from TTS.api import TTS  # Using Coqui TTS for simplicity
+# Build the TTS object once at module level; generate_emotional_speech reads
+# this global on every call.
+tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
+def generate_emotional_speech(text, emotion):
+    # Map emotion to voice modulation parameters (pitch, speed)
+    emotion_settings = {
+    "happy": {"pitch": 1.3, "speed": 1.2},       # Upbeat and energetic
+    "joy": {"pitch": 1.2, "speed": 1.1},         # Less exaggerated than 'happy'
+    "surprise": {"pitch": 1.5, "speed": 1.3},    # Excitement with high pitch and fast speech
+    "sad": {"pitch": 0.8, "speed": 0.9},         # Subdued, slow tone
+    "angry": {"pitch": 1.6, "speed": 1.4},       # Intense and sharp
+    "fear": {"pitch": 1.2, "speed": 0.95},       # Tense and slightly slow
+    "disgust": {"pitch": 0.9, "speed": 0.95},    # Low and deliberate
+    "shame": {"pitch": 0.8, "speed": 0.85},      # Quiet, subdued tone
+    "neutral": {"pitch": 1.0, "speed": 1.0},     # Baseline conversational tone
+}
+    # Retrieve pitch and speed based on detected emotion
+    settings = emotion_settings.get(emotion, {"pitch": 1.0, "speed": 1.0})
+    # Generate speech with the TTS model
+    # Instead of directly passing speed and pitch to tts_to_file,
+    # We adjust the text to simulate the effect. This is a temporary solution.
+    # You might need to fine-tune these adjustments or consider a different TTS library
+    # with better control over speech parameters.
+    adjusted_text = text
+    if settings['speed'] > 1.0:
+        adjusted_text = adjusted_text.replace(" ", ".")  # Simulate faster speech
+    elif settings['speed'] < 1.0:
+        adjusted_text = adjusted_text.replace(" ", "...")  # Simulate slower speech
+    # Explicitly specify the output path
+    audio_path = "output.wav"  # Or any desired filename
+    tts_model.tts_to_file(text=adjusted_text, file_path=audio_path)  # Pass file_path argument
+    return audio_path
+# Example usage. Runs at module level and rebinds the global ``emotion`` set by
+# the earlier detection example.
+emotion = "happy"
+output_audio = generate_emotional_speech("Welcome to the smart library!", emotion)
+print(f"Generated Speech Saved At: {output_audio}")
+"""Integrating the Workflow"""
+from IPython.display import Audio, display
+def emotion_aware_tts_pipeline(text):
+    emotion, confidence = detect_emotion(text)
+    print(f"Emotion Detected: {emotion} with Confidence: {confidence:.2f}")
+    audio_path = generate_emotional_speech(text, emotion)
+    print(f"Audio Generated: {audio_path}")
+    # Display and play the audio
+    display(Audio(audio_path, autoplay=True))
+# Example usage. Runs the whole detect, synthesize and play pipeline at module level.
+emotion_aware_tts_pipeline("I can’t stooop smiiiling, everything feels perrrfect!")
+"""Fine-tuning the Emotion Detection Model"""
+import os
+# Set before transformers is imported below; presumably turns off
+# Weights & Biases reporting in the Trainer. TODO confirm.
+os.environ["WANDB_DISABLED"] = "true"
+from google.colab import drive
+from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
+from datasets import load_dataset
+# Load the dataset from Google Drive (requires the drive to be mounted).
+dataset = load_dataset('/content/drive/MyDrive/Emotion_Model')  # replace with the path to your dataset
+# Tokenizer from the same checkpoint that is fine-tuned below.
+tokenizer = AutoTokenizer.from_pretrained("bhadresh-savani/distilbert-base-uncased-emotion")
+# Define a function to map emotion labels to integers
+def map_emotion_to_int(example):
+    # Assuming your dataset has an 'emotion' column with string labels
+    # Replace this with your actual emotion labels and their corresponding integers
+    # **Change 'emotion' to the actual column name in your dataset**
+    emotion_mapping = {
+        "neutral": 0,
+        "joy": 1,
+        "sad": 2,
+        "anger": 3,
+        "fear": 4,
+        "surprise": 5,
+        "disgust": 6,
+        "shame": 7,
+    }
+    # Assuming your emotion column is named 'label'
+    # example['label'] = emotion_mapping[example['emotion']] # Create a new 'label' column with integer values
+    example['label'] = emotion_mapping.get(example['label'], -1) # If the label is not in the emotion mapping then we set it to -1. We can later filter these examples out
+    return example
+def preprocess_data(example):
+    return tokenizer(example['text'], truncation=True, padding=True, max_length=512) # Added max_length for consistency
+# Convert string labels to integer ids one example at a time, before tokenization.
+dataset = dataset.map(map_emotion_to_int, batched=False)
+# Drop examples whose label was not in the mapping (map_emotion_to_int marks them with -1).
+dataset = dataset.filter(lambda example: example['label'] != -1)
+# Tokenize in batches. Only the raw 'text' column is removed; 'label' is kept for training.
+tokenized_dataset = dataset.map(preprocess_data, batched=True, remove_columns=['text'])
+# Load the classifier with 8 output labels, matching the 8 ids produced by
+# map_emotion_to_int. ignore_mismatched_sizes=True is presumably needed because
+# the checkpoint's classification head has a different label count. TODO confirm.
+model = AutoModelForSequenceClassification.from_pretrained(
+    "bhadresh-savani/distilbert-base-uncased-emotion",
+    num_labels=8,
+    ignore_mismatched_sizes=True
+)
+# Training arguments. Evaluation and checkpointing both happen once per epoch,
+# and the checkpoint with the lowest eval_loss is reloaded at the end.
+training_args = TrainingArguments(
+    output_dir="./results",    # Directory for model checkpoints
+    evaluation_strategy="epoch",  # Evaluate after every epoch
+    learning_rate=5e-5,  # Learning rate
+    per_device_train_batch_size=16,  # 16 examples per device per step
+    gradient_accumulation_steps=4,  # 16 x 4 = 64 examples per device between optimizer updates
+    num_train_epochs=5,  # Train for 5 epochs
+    weight_decay=0.01,  # Regularization to avoid overfitting
+    save_strategy="epoch",  # Save checkpoints after each epoch
+    logging_dir="./logs",  # Directory for logging
+    logging_steps=100,  # Log every 100 steps
+    warmup_steps=500,  # Gradual learning rate increase for the first 500 steps
+    save_total_limit=3,  # Keep at most 3 checkpoints on disk
+    fp16=True,  # Mixed precision; NOTE(review): presumably requires a GPU, so confirm the target runtime
+    load_best_model_at_end=True,  # Load the best model at the end of training
+    metric_for_best_model="eval_loss",  # Use evaluation loss to select the best model
+    greater_is_better=False,  # Lower loss is better
+)
+# Train model. The dataset must provide both a 'train' and a 'validation' split.
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_dataset['train'],
+    eval_dataset=tokenized_dataset['validation'],
+    tokenizer=tokenizer,
+)
+trainer.train()
+# Save the model and tokenizer to Google Drive. Both go into the same directory.
+model_save_path = "/content/drive/My Drive/emotion_detection_model1"
+tokenizer_save_path = "/content/drive/My Drive/emotion_detection_model1"
+# Save the fine-tuned model
+model.save_pretrained(model_save_path)
+tokenizer.save_pretrained(tokenizer_save_path)
+print("Model and tokenizer saved to Google Drive.")
+"""Reload the Fine-Tuned Model"""
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+# Mount Google Drive
+from google.colab import drive
+drive.mount('/content/drive')
+# Path to the saved model and tokenizer
+model_save_path = "/content/drive/My Drive/emotion_detection_model"
+tokenizer_save_path = "/content/drive/My Drive/emotion_detection_model"
+# Load the fine-tuned model and tokenizer
+model = AutoModelForSequenceClassification.from_pretrained(model_save_path)
+tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)
+print("Fine-tuned model and tokenizer loaded successfully.")
+"""Test the Reloaded Model"""
+from transformers import pipeline
+# Create a text classification pipeline with the loaded model. This rebinds the
+# module-level ``emotion_classifier``, which detect_emotion reads on each call.
+emotion_classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
+# Smoke test with a sample text; prints the raw pipeline result.
+text = "I feel so upset today!"
+result = emotion_classifier(text)
+print(result)
+"""Fine-tuning the TTS System"""
+from TTS.api import TTS
+from TTS.utils.audio import AudioProcessor
+from TTS.tts.models.tacotron2 import Tacotron2
+import torch
+# Load the pre-trained model through the high-level TTS wrapper.
+tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC") # Use TTS for model loading
+# Take the underlying model object from the wrapper. This rebinds the
+# module-level ``model``, which previously held the emotion classifier.
+model = tts.synthesizer.tts_model
+# Fine-tuning parameters, set as attributes on the model config.
+# NOTE(review): nothing visible in this file reads these two values. Confirm they have any effect.
+model.config.dataset_path = "/content/drive/MyDrive/RAVDESS"
+model.config.num_epochs = 10
+# NOTE(review): if this object is a torch.nn.Module, train() only switches it
+# to training mode and runs no optimisation. The weights saved below would
+# then be the unchanged pre-trained ones. Confirm.
+model.train()
+# Define the save path on Google Drive
+save_path = "/content/drive/My Drive/fine_tuned_tacotron2.pth"
+# Save the model's state dictionary using torch.save
+torch.save(model.state_dict(), save_path)
+"""Set up the Gradio interface"""
+import gradio as gr
+from transformers import pipeline
+from TTS.api import TTS
+# Load the pre-trained emotion detection checkpoint. This rebinds the
+# module-level ``emotion_classifier``, replacing the pipeline built earlier
+# from the fine-tuned model.
+emotion_classifier = pipeline("text-classification", model="bhadresh-savani/distilbert-base-uncased-emotion")
+# Load TTS model
+tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
+# Emotion-specific pitch and speed, keyed by lower-case emotion label.
+# Labels missing from this table fall back to pitch 1.0 / speed 1.0 in
+# emotion_aware_tts_pipeline.
+emotion_settings = {
+    "joy": {"pitch": 1.2, "speed": 1.1},
+    "sadness": {"pitch": 0.8, "speed": 0.9},
+    "anger": {"pitch": 1.0, "speed": 1.2},
+    "fear": {"pitch": 0.9, "speed": 1.0},
+    "surprise": {"pitch": 1.3, "speed": 1.2},
+    "neutral": {"pitch": 1.0, "speed": 1.0},
+}
+# Function to process text or file input and generate audio
+def emotion_aware_tts_pipeline(input_text=None, file_input=None):
+    try:
+        # Get text from input or file
+        if file_input:
+            with open(file_input.name, 'r') as file:
+                input_text = file.read()
+        if input_text:
+            # Detect emotion
+            emotion_data = emotion_classifier(input_text)[0]
+            emotion = emotion_data['label']
+            confidence = emotion_data['score']
+            # Adjust pitch and speed
+            settings = emotion_settings.get(emotion.lower(), {"pitch": 1.0, "speed": 1.0})
+            pitch = settings["pitch"]
+            speed = settings["speed"]
+            # Generate audio
+            audio_path = "output.wav"
+            tts_model.tts_to_file(text=input_text, file_path=audio_path, speed=speed, pitch=pitch)
+            return f"Detected Emotion: {emotion} (Confidence: {confidence:.2f})", audio_path
+        else:
+            return "Please provide input text or file", None
+    except Exception as e:
+        # Return error message if something goes wrong
+        return f"Error: {str(e)}", None
+# Define the Gradio interface. The two inputs map positionally onto
+# (input_text, file_input), and the two outputs onto the (message, audio_path)
+# tuple returned by emotion_aware_tts_pipeline.
+iface = gr.Interface(
+    fn=emotion_aware_tts_pipeline,
+    inputs=[
+        gr.Textbox(label="Input Text", placeholder="Enter text here"),
+        gr.File(label="Upload a Text File")
+    ],
+    outputs=[
+        gr.Textbox(label="Detected Emotion"),
+        gr.Audio(label="Generated Audio")
+    ],
+    title="Emotion-Aware Text-to-Speech",
+    description="Input text or upload a text file to detect the emotion and generate audio with emotion-aware modulation."
+)
+# Launch the Gradio interface (runs at module level).
+iface.launch()