Update main.py
main.py
CHANGED
@@ -3,15 +3,15 @@ drive.mount('/content/drive')
 
 """Install Dependencies"""
 
-
+pip install transformers librosa torch soundfile numba numpy TTS datasets gradio protobuf==3.20.3
 
 """Emotion Detection (Using Text Dataset)
 
 """
 
-
+!pip install --upgrade numpy tensorflow transformers TTS
 
-
+!pip freeze > requirements.txt
 
 from transformers import pipeline
 
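Note: the bare pip install line added above is only valid in a notebook cell (and even there it is normally written !pip or %pip, as the two lines below it are); in a plain main.py it is a SyntaxError. A script-safe sketch of the same install step, assuming the identical package list:

import subprocess
import sys

# Sketch: install the pinned dependencies through the running interpreter,
# equivalent to `python -m pip install ...`, so it also works outside notebooks.
def install(packages):
    subprocess.check_call([sys.executable, "-m", "pip", "install", *packages])

install([
    "transformers", "librosa", "torch", "soundfile", "numba",
    "numpy", "TTS", "datasets", "gradio", "protobuf==3.20.3",
])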
@@ -43,21 +43,26 @@ tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
 def generate_emotional_speech(text, emotion):
     # Map emotion to voice modulation parameters (pitch, speed)
     emotion_settings = {
-
-
-
-
-
-
-
-
-    }
-
+        "happy": {"pitch": 1.3, "speed": 1.2},     # Upbeat and energetic
+        "joy": {"pitch": 1.2, "speed": 1.1},       # Less exaggerated than 'happy'
+        "surprise": {"pitch": 1.5, "speed": 1.3},  # Excitement with high pitch and fast speech
+        "sad": {"pitch": 0.8, "speed": 0.9},       # Subdued, slow tone
+        "angry": {"pitch": 1.6, "speed": 1.4},     # Intense and sharp
+        "fear": {"pitch": 1.2, "speed": 0.95},     # Tense and slightly slow
+        "disgust": {"pitch": 0.9, "speed": 0.95},  # Low and deliberate
+        "shame": {"pitch": 0.8, "speed": 0.85},    # Quiet, subdued tone
+        "neutral": {"pitch": 1.0, "speed": 1.0},   # Baseline conversational tone
+    }
+
+
+
     # Retrieve pitch and speed based on detected emotion
     settings = emotion_settings.get(emotion, {"pitch": 1.0, "speed": 1.0})
-    # Generate speech with the TTS model
+    # Generate speech with the TTS model
+    # Instead of directly passing speed and pitch to tts_to_file,
     # We adjust the text to simulate the effect. This is a temporary solution.
-    # You might need to fine-tune these adjustments or consider a different TTS library
+    # You might need to fine-tune these adjustments or consider a different TTS library
+    # with better control over speech parameters.
     adjusted_text = text
     if settings['speed'] > 1.0:
         adjusted_text = adjusted_text.replace(" ", ".") # Simulate faster speech
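As the in-code comments concede, the space-to-period replacement only nudges the synthesizer's pausing; it does not genuinely change speaking rate, and nothing in this function applies the pitch value at all. One alternative is to post-process the generated WAV, sketched below on the assumption that librosa and soundfile are available (the 12-semitone mapping is a rough heuristic, not something from this repo):

import librosa
import soundfile as sf

def apply_emotion_prosody(audio_path, pitch=1.0, speed=1.0):
    # Post-process a generated WAV: time-stretch for speed, pitch-shift for pitch.
    y, sr = librosa.load(audio_path)
    if speed != 1.0:
        y = librosa.effects.time_stretch(y, rate=speed)  # rate > 1.0 speeds up
    if pitch != 1.0:
        # Map the multiplicative pitch factor onto semitones (heuristic)
        y = librosa.effects.pitch_shift(y, sr=sr, n_steps=(pitch - 1.0) * 12)
    sf.write(audio_path, y, sr)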
@@ -68,7 +73,6 @@ def generate_emotional_speech(text, emotion):
     audio_path = "output.wav" # Or any desired filename
     tts_model.tts_to_file(text=adjusted_text, file_path=audio_path) # Pass file_path argument
     return audio_path
-
 
 # Example usage
 emotion = "happy"
@@ -248,19 +252,7 @@ save_path = "/content/drive/My Drive/fine_tuned_tacotron2.pth"
 # Save the model's state dictionary using torch.save
 torch.save(model.state_dict(), save_path)
 
-
 """Set up the Gradio interface"""
-import librosa
-import soundfile as sf
-
-def adjust_pitch(audio_path, pitch_factor):
-    # Load audio
-    y, sr = librosa.load(audio_path)
-    # Adjust pitch
-    y_shifted = librosa.effects.pitch_shift(y, sr, n_steps=pitch_factor)
-    # Save adjusted audio
-    sf.write(audio_path, y_shifted, sr)
-
 
 import gradio as gr
 from transformers import pipeline
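For anyone restoring the helper deleted above: librosa.effects.pitch_shift(y, sr, n_steps=...) passes sr positionally, and in librosa 0.10+ sr is keyword-only, so that call raises a TypeError at runtime. A corrected version, should it be needed again:

import librosa
import soundfile as sf

def adjust_pitch(audio_path, pitch_factor):
    # Load the generated audio (librosa resamples to 22050 Hz by default)
    y, sr = librosa.load(audio_path)
    # sr and n_steps must be passed as keywords in librosa >= 0.10
    y_shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch_factor)
    # Overwrite the file with the shifted audio
    sf.write(audio_path, y_shifted, sr)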
@@ -274,17 +266,14 @@ tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
 
 # Emotion-specific settings for pitch and speed
 emotion_settings = {
-    "
-    "joy": {"pitch": 1.3, "speed": 1.2},
+    "joy": {"pitch": 1.2, "speed": 1.1},
     "sadness": {"pitch": 0.8, "speed": 0.9},
-    "anger": {"pitch": 1.
-    "fear": {"pitch":
-    "surprise": {"pitch": 1.
-    "
-    "shame": {"pitch": 0.8, "speed": 0.85},
+    "anger": {"pitch": 1.0, "speed": 1.2},
+    "fear": {"pitch": 0.9, "speed": 1.0},
+    "surprise": {"pitch": 1.3, "speed": 1.2},
+    "neutral": {"pitch": 1.0, "speed": 1.0},
 }
 
-
 # Function to process text or file input and generate audio
 def emotion_aware_tts_pipeline(input_text=None, file_input=None):
     try:
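One thing to watch: this second emotion_settings table and the earlier one inside generate_emotional_speech use different key sets ("joy"/"sadness"/"anger" here versus "happy"/"sad"/"angry" there), and emotion_settings.get(..., default) silently falls back to neutral prosody for any unmatched label. A small normalization sketch (the alias table is illustrative, not from this commit):

# Map alternate label spellings onto the keys this table actually defines,
# so near-miss labels don't silently fall back to neutral prosody.
LABEL_ALIASES = {"happy": "joy", "sad": "sadness", "angry": "anger"}

def lookup_settings(label):
    key = LABEL_ALIASES.get(label.lower(), label.lower())
    return emotion_settings.get(key, {"pitch": 1.0, "speed": 1.0})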
@@ -299,33 +288,24 @@ def emotion_aware_tts_pipeline(input_text=None, file_input=None):
             emotion = emotion_data['label']
             confidence = emotion_data['score']
 
-            # Adjust
+            # Adjust pitch and speed
             settings = emotion_settings.get(emotion.lower(), {"pitch": 1.0, "speed": 1.0})
-            speed = settings["speed"]
             pitch = settings["pitch"]
+            speed = settings["speed"]
-
-            if speed > 1.0:
-                input_text = input_text.replace(" ", ". ") # Faster speech simulation
-            elif speed < 1.0:
-                input_text = input_text.replace(" ", "... ") # Slower speech simulation
 
             # Generate audio
             audio_path = "output.wav"
-            tts_model.tts_to_file(text=input_text, file_path=audio_path)
-
-            # Adjust pitch
-            pitch_factor = (pitch - 1.0) * 12 # Convert to semitones for librosa
-            adjust_pitch(audio_path, pitch_factor)
+            tts_model.tts_to_file(text=input_text, file_path=audio_path, speed=speed, pitch=pitch)
 
             return f"Detected Emotion: {emotion} (Confidence: {confidence:.2f})", audio_path
         else:
             return "Please provide input text or file", None
     except Exception as e:
+        # Return error message if something goes wrong
         return f"Error: {str(e)}", None
 
-
 # Define Gradio interface
-interface = gr.Interface(
+iface = gr.Interface(
     fn=emotion_aware_tts_pipeline,
     inputs=[
         gr.Textbox(label="Input Text", placeholder="Enter text here"),
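A caution on the new tts_to_file(..., speed=speed, pitch=pitch) call: the Tacotron2-DDC model used here has no documented pitch control, so whether those keyword arguments are honored, or even accepted, depends on the installed TTS version. A defensive drop-in sketch for that line, falling back to plain synthesis plus the apply_emotion_prosody helper sketched earlier:

try:
    # Forward prosody hints if this TTS version accepts them
    tts_model.tts_to_file(text=input_text, file_path=audio_path,
                          speed=speed, pitch=pitch)
except TypeError:
    # Otherwise synthesize plainly and shape prosody as a post-process
    tts_model.tts_to_file(text=input_text, file_path=audio_path)
    apply_emotion_prosody(audio_path, pitch=pitch, speed=speed)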
@@ -340,4 +320,4 @@ interface = gr.Interface(
 )
 
 # Launch Gradio interface
-interface.launch()
+iface.launch()
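For context, emotion_aware_tts_pipeline returns a status string plus an audio path, and the unchanged middle of the interface definition is not shown in this diff. A sketch of what the full definition presumably looks like after this commit (the gr.File input, the output components, and the title are assumptions inferred from the function's signature and return values):

import gradio as gr

iface = gr.Interface(
    fn=emotion_aware_tts_pipeline,
    inputs=[
        gr.Textbox(label="Input Text", placeholder="Enter text here"),
        gr.File(label="Input File"),           # assumed: matches file_input
    ],
    outputs=[
        gr.Textbox(label="Detected Emotion"),  # status or error string
        gr.Audio(label="Generated Speech"),    # path to output.wav
    ],
    title="Emotion-Aware TTS",                 # assumed
)

# Launch the app
iface.launch()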