Anita-19 committed on
Commit
bbd5b58
Β·
verified Β·
1 Parent(s): 8573520

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +51 -49
main.py CHANGED
@@ -1,17 +1,16 @@
1
  from google.colab import drive
2
  drive.mount('/content/drive')
3
 
4
- """Install Dependencies"""
5
-
6
  pip install transformers librosa torch soundfile numba numpy TTS datasets gradio protobuf==3.20.3
7
 
8
- """Emotion Detection (Using Text Dataset)
9
 
10
- """
11
 
12
  !pip install --upgrade numpy tensorflow transformers TTS
13
 
14
- !pip freeze > requirements.txt
15
 
16
  from transformers import pipeline
17
 
@@ -29,7 +28,7 @@ text = "I am feeling excited today!"
29
  emotion, confidence = detect_emotion(text)
30
  print(f"Detected Emotion: {emotion}, Confidence: {confidence}")
31
 
32
- """Emotion-Aware TTS (Using Tacotron 2 or Similar)"""
33
 
34
  import torch
35
  import librosa
@@ -40,17 +39,19 @@ from TTS.api import TTS # Using Coqui TTS for simplicity
40
  tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
41
 
42
 
43
- emotion_settings = {
44
- "neutral": {"pitch": 1.0, "speed": 1.0, "prosody": 0.5}, # Neutral tone
45
- "joy": {"pitch": 1.3, "speed": 1.2, "prosody": 1.5}, # Upbeat, energetic
46
- "sadness": {"pitch": 0.8, "speed": 0.9, "prosody": 0.8}, # Subdued, slow tone
47
- "anger": {"pitch": 1.6, "speed": 1.4, "prosody": 1.8}, # Sharp, intense
48
- "fear": {"pitch": 1.2, "speed": 0.95, "prosody": 1.2}, # Tense, slow
49
- "surprise": {"pitch": 1.5, "speed": 1.3, "prosody": 1.4}, # Excited, high energy
50
- "disgust": {"pitch": 0.9, "speed": 0.95, "prosody": 0.6}, # Low, deliberate
51
- "shame": {"pitch": 0.8, "speed": 0.85, "prosody": 0.5}, # Quiet, subdued tone
52
- }
 
53
 
 
54
 
55
  import librosa
56
  import soundfile as sf
@@ -73,32 +74,34 @@ def adjust_speed(audio_path, speed_factor):
73
  # Save the adjusted audio
74
  sf.write(audio_path, y_speeded, sr)
75
 
76
-
77
-
78
- def generate_emotional_speech(text, emotion):
79
- # Retrieve pitch, speed, and prosody based on detected emotion
80
- settings = emotion_settings.get(emotion, {"pitch": 1.0, "speed": 1.0, "prosody": 1.0})
81
- pitch = settings["pitch"]
82
- speed = settings["speed"]
83
-
84
- # Generate mel spectrogram with TTS
85
- mel_spectrogram = tts_model.get_mel_spectrogram(text)
86
-
87
- # Use HiFi-GAN vocoder to decode the spectrogram into waveform
88
- audio = vocoder.decode(mel_spectrogram)
89
- audio_path = "output.wav"
90
- librosa.output.write_wav(audio_path, audio, sr=22050) # Save the initial audio
 
 
91
 
92
- # Apply post-processing: adjust pitch and speed
93
- adjust_pitch_and_speed(audio_path, pitch_factor=pitch, speed_factor=speed)
 
 
94
 
95
- return audio_path
96
 
97
- # Integrating Emotion Detection and TTS Pipeline
98
  from IPython.display import Audio, display
99
 
100
  def emotion_aware_tts_pipeline(text):
101
- # Ensure the emotion_classifier is being accessed globally
102
  emotion, confidence = detect_emotion(text)
103
  print(f"Emotion Detected: {emotion} with Confidence: {confidence:.2f}")
104
 
@@ -210,7 +213,7 @@ tokenizer.save_pretrained(tokenizer_save_path)
210
 
211
  print("Model and tokenizer saved to Google Drive.")
212
 
213
- """Reload the Fine-Tuned Model"""
214
 
215
  from transformers import AutoModelForSequenceClassification, AutoTokenizer
216
 
@@ -228,7 +231,7 @@ tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)
228
 
229
  print("Fine-tuned model and tokenizer loaded successfully.")
230
 
231
- """Test the Reloaded Model"""
232
 
233
  from transformers import pipeline
234
 
@@ -240,7 +243,7 @@ text = "I feel so upset today!"
240
  result = emotion_classifier(text)
241
  print(result)
242
 
243
- """Fine-tuning the TTS System"""
244
 
245
  from TTS.api import TTS
246
  from TTS.utils.audio import AudioProcessor
@@ -268,7 +271,7 @@ save_path = "/content/drive/My Drive/fine_tuned_tacotron2.pth"
268
  torch.save(model.state_dict(), save_path)
269
 
270
 
271
- """Set up the Gradio interface"""
272
 
273
  import gradio as gr
274
  from transformers import pipeline
@@ -282,18 +285,17 @@ tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
282
 
283
  # Emotion-specific settings for pitch and speed
284
  emotion_settings = {
285
- "neutral": {"pitch": 1.0, "speed": 1.0, "prosody": 0.5}, # Neutral tone
286
- "joy": {"pitch": 1.3, "speed": 1.2, "prosody": 1.5}, # Upbeat, energetic
287
- "sadness": {"pitch": 0.8, "speed": 0.9, "prosody": 0.8}, # Subdued, slow tone
288
- "anger": {"pitch": 1.6, "speed": 1.4, "prosody": 1.8}, # Sharp, intense
289
- "fear": {"pitch": 1.2, "speed": 0.95, "prosody": 1.2}, # Tense, slow
290
- "surprise": {"pitch": 1.5, "speed": 1.3, "prosody": 1.4}, # Excited, high energy
291
- "disgust": {"pitch": 0.9, "speed": 0.95, "prosody": 0.6}, # Low, deliberate
292
- "shame": {"pitch": 0.8, "speed": 0.85, "prosody": 0.5}, # Quiet, subdued tone
293
  }
294
 
295
 
296
-
297
  # Function to process text or file input and generate audio
298
  def emotion_aware_tts_pipeline(input_text=None, file_input=None):
299
  try:
 
1
  from google.colab import drive
2
  drive.mount('/content/drive')
3
 
4
+ #Install Dependencies"""
5
+ """
6
  pip install transformers librosa torch soundfile numba numpy TTS datasets gradio protobuf==3.20.3
7
 
8
+ #Emotion Detection (Using Text Dataset)
9
 
 
10
 
11
  !pip install --upgrade numpy tensorflow transformers TTS
12
 
13
+ !pip freeze > requirements.txt"""
14
 
15
  from transformers import pipeline
16
 
 
28
  emotion, confidence = detect_emotion(text)
29
  print(f"Detected Emotion: {emotion}, Confidence: {confidence}")
30
 
31
+ #Emotion-Aware TTS (Using Tacotron 2 or Similar)"""
32
 
33
  import torch
34
  import librosa
 
39
  tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
40
 
41
 
42
+ def generate_emotional_speech(text, emotion):
43
+ # Map emotion to voice modulation parameters (pitch, speed)
44
+ emotion_settings = {
45
+ "neutral": {"pitch": 1.0, "speed": 1.0}, # Baseline conversational tone
46
+ "joy": {"pitch": 1.3, "speed": 1.2}, # Upbeat and energetic
47
+ "sadness": {"pitch": 0.8, "speed": 0.9}, # Subdued, slow tone
48
+ "anger": {"pitch": 1.6, "speed": 1.4}, # Intense and sharp
49
+ "fear": {"pitch": 1.2, "speed": 0.95}, # Tense and slightly slow
50
+ "surprise": {"pitch": 1.5, "speed": 1.3}, # Excitement with high pitch and fast speech
51
+ "disgust": {"pitch": 0.9, "speed": 0.95}, # Low and deliberate
52
+ "shame": {"pitch": 0.8, "speed": 0.85}, # Quiet, subdued tone
53
 
54
+ }
55
 
56
  import librosa
57
  import soundfile as sf
 
74
  # Save the adjusted audio
75
  sf.write(audio_path, y_speeded, sr)
76
 
77
+ # Retrieve pitch and speed based on detected emotion
78
+ settings = emotion_settings.get(emotion, {"pitch": 1.0, "speed": 1.0})
79
+ # Generate speech with the TTS model
80
+ # Instead of directly passing speed and pitch to tts_to_file,
81
+ # We adjust the text to simulate the effect. This is a temporary solution.
82
+ # You might need to fine-tune these adjustments or consider a different TTS library
83
+ # with better control over speech parameters.
84
+ adjusted_text = text
85
+ if settings['speed'] > 1.0:
86
+ adjusted_text = adjusted_text.replace(" ", ".") # Simulate faster speech
87
+ elif settings['speed'] < 1.0:
88
+ adjusted_text = adjusted_text.replace(" ", "...") # Simulate slower speech
89
+
90
+ # Explicitly specify the output path
91
+ audio_path = "output.wav" # Or any desired filename
92
+ tts_model.tts_to_file(text=adjusted_text, file_path=audio_path) # Pass file_path argument
93
+ return audio_path
94
 
95
+ # Example usage
96
+ emotion = "happy"
97
+ output_audio = generate_emotional_speech("Welcome to the smart library!", emotion)
98
+ print(f"Generated Speech Saved At: {output_audio}")
99
 
100
+ """Integrating the Workflow"""
101
 
 
102
  from IPython.display import Audio, display
103
 
104
  def emotion_aware_tts_pipeline(text):
 
105
  emotion, confidence = detect_emotion(text)
106
  print(f"Emotion Detected: {emotion} with Confidence: {confidence:.2f}")
107
 
 
213
 
214
  print("Model and tokenizer saved to Google Drive.")
215
 
216
+ #Reload the Fine-Tuned Model"""
217
 
218
  from transformers import AutoModelForSequenceClassification, AutoTokenizer
219
 
 
231
 
232
  print("Fine-tuned model and tokenizer loaded successfully.")
233
 
234
+ #Test the Reloaded Model"""
235
 
236
  from transformers import pipeline
237
 
 
243
  result = emotion_classifier(text)
244
  print(result)
245
 
246
+ #Fine-tuning the TTS System"""
247
 
248
  from TTS.api import TTS
249
  from TTS.utils.audio import AudioProcessor
 
271
  torch.save(model.state_dict(), save_path)
272
 
273
 
274
+ #Set up the Gradio interface
275
 
276
  import gradio as gr
277
  from transformers import pipeline
 
285
 
286
  # Emotion-specific settings for pitch and speed
287
  emotion_settings = {
288
+ "neutral": {"pitch": 1.0, "speed": 1.0},
289
+ "joy": {"pitch": 1.3, "speed": 1.2},
290
+ "sadness": {"pitch": 0.8, "speed": 0.9},
291
+ "anger": {"pitch": 1.6, "speed": 1.4},
292
+ "fear": {"pitch": 1.2, "speed": 0.95},
293
+ "surprise": {"pitch": 1.5, "speed": 1.3},
294
+ "disgust": {"pitch": 0.9, "speed": 0.95},
295
+ "shame": {"pitch": 0.8, "speed": 0.85},
296
  }
297
 
298
 
 
299
  # Function to process text or file input and generate audio
300
  def emotion_aware_tts_pipeline(input_text=None, file_input=None):
301
  try: