Aliaksandr committed on
Commit
e9aad3f
Β·
unverified Β·
2 Parent(s): 5dec512 93a309d

Merge pull request #1 from navalnica/feature/sound-and-emetion-generation

Browse files
src/emotions/generation.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ import json
3
+ from requests import HTTPError
4
+ from abc import ABC, abstractmethod
5
+
6
+ from .prompts import SOUND_EFFECT_GENERATION, SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION, TEXT_MODIFICATION
7
+ from .utils import get_audio_duration
8
+ from src.config import logger
9
+
10
+
11
class AbstractEffectGenerator(ABC):
    """Interface for sound-effect / emotion-text generators.

    Implementations produce dicts describing a sound effect (text prompt plus
    ElevenLabs generation parameters) or an emotion-annotated rewrite of text.
    """

    @abstractmethod
    def generate_text_for_sound_effect(self, text: str) -> dict:
        """Return a dict describing a sound effect for the given text."""
        pass

    @abstractmethod
    def generate_parameters_for_sound_effect(self, text: str, generated_audio_file: str) -> dict:
        """Return sound-effect parameters, optionally measuring an existing audio file's duration."""
        pass

    @abstractmethod
    def add_emotion_to_text(self, text: str) -> dict:
        """Return a dict with the emotionally annotated text and TTS voice parameters."""
        pass
23
+
24
class EffectGenerator(AbstractEffectGenerator):
    """Synchronous sound-effect / emotion generator backed by the OpenAI chat API.

    Uses JSON-mode chat completions to turn book text into either a
    sound-effect description (with ElevenLabs generation parameters) or an
    emotion-annotated rewrite of the text itself.
    """

    def __init__(self, api_key: str, predict_duration: bool = True, model_type: str = 'gpt-4o'):
        """
        :param api_key: OpenAI API key.
        :param predict_duration: if True, ask the LLM to also predict the effect duration.
        :param model_type: chat model name to use.
        """
        self.client = openai.OpenAI(api_key=api_key)
        # Prompt variant depends on whether the LLM should predict duration itself.
        self.sound_effect_prompt = SOUND_EFFECT_GENERATION if predict_duration else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
        self.text_modification_prompt = TEXT_MODIFICATION
        self.model_type = model_type
        logger.info(f"EffectGenerator initialized with model_type: {model_type}, predict_duration: {predict_duration}")

    def generate_text_for_sound_effect(self, text: str) -> dict:
        """Generate sound effect description and parameters based on input text.

        :raises RuntimeError: on JSON parse failure, HTTP error, or any other failure.
        """
        chatgpt_output = None  # defined up-front so the except-blocks can reference it safely
        try:
            completion = self.client.chat.completions.create(
                model=self.model_type,
                messages=[
                    {"role": "system", "content": self.sound_effect_prompt},
                    {"role": "user", "content": text}
                ],
                response_format={"type": "json_object"}
            )
            # Extract and parse the model's JSON answer.
            chatgpt_output = completion.choices[0].message.content
            output_dict = json.loads(chatgpt_output)
            logger.info("Successfully generated sound effect description: %s", output_dict)
            return output_dict

        except json.JSONDecodeError as e:
            logger.error("Failed to parse the output text as JSON: %s", e)
            raise RuntimeError(f"Error: Failed to parse the output text as JSON.\nOutput: {chatgpt_output}")

        except HTTPError as e:
            logger.error("HTTP error occurred: %s", e)
            raise RuntimeError(f"HTTP Error: {e}")

        except Exception as e:
            logger.error("Unexpected error occurred: %s", e)
            raise RuntimeError(f"Unexpected Error: {e}")

    def generate_parameters_for_sound_effect(self, text: str, generated_audio_file: str = None) -> dict:
        """Generate effect parameters; if an audio file is given, use its real duration.

        :param text: source text to describe as a sound effect.
        :param generated_audio_file: optional path to an already-generated audio file.
        """
        llm_output = self.generate_text_for_sound_effect(text)
        if generated_audio_file is not None:
            # Prefer the measured duration of the real audio over the LLM's estimate.
            llm_output['duration_seconds'] = get_audio_duration(generated_audio_file)
            logger.info("Added duration_seconds to output based on generated audio file: %s", generated_audio_file)
        return llm_output

    def add_emotion_to_text(self, text: str) -> dict:
        """Annotate text with emotional emphasis and TTS voice parameters.

        :raises RuntimeError: if the model's output is not valid JSON.
        """
        completion = self.client.chat.completions.create(
            model=self.model_type,
            messages=[{"role": "system", "content": self.text_modification_prompt},
                      {"role": "user", "content": text}],
            response_format={"type": "json_object"}
        )
        chatgpt_output = completion.choices[0].message.content
        try:
            output_dict = json.loads(chatgpt_output)
            logger.info("Successfully modified text with emotional cues: %s", output_dict)
            return output_dict
        except json.JSONDecodeError as e:
            logger.error("Error in parsing the modified text: %s", e)
            # BUG FIX: the original raised a bare f-string, which is a TypeError in
            # Python 3 ("exceptions must derive from BaseException").
            raise RuntimeError(f"error, output_text: {chatgpt_output}") from e
85
+
86
+
87
class EffectGeneratorAsync(AbstractEffectGenerator):
    """Asynchronous counterpart of EffectGenerator, using openai.AsyncOpenAI.

    Same contract as the sync class; every generation method is a coroutine.
    """

    def __init__(self, api_key: str, predict_duration: bool = True, model_type: str = 'gpt-4o'):
        """
        :param api_key: OpenAI API key.
        :param predict_duration: if True, ask the LLM to also predict the effect duration.
        :param model_type: chat model name to use.
        """
        self.client = openai.AsyncOpenAI(api_key=api_key)
        # Prompt variant depends on whether the LLM should predict duration itself.
        self.sound_effect_prompt = SOUND_EFFECT_GENERATION if predict_duration else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
        self.text_modification_prompt = TEXT_MODIFICATION
        self.model_type = model_type

    async def generate_text_for_sound_effect(self, text: str) -> dict:
        """Asynchronous version to generate sound effect description.

        :raises RuntimeError: on JSON parse failure, HTTP error, or any other failure.
        """
        chatgpt_output = None  # defined up-front so the except-blocks can reference it safely
        try:
            completion = await self.client.chat.completions.create(
                model=self.model_type,
                messages=[
                    {"role": "system", "content": self.sound_effect_prompt},
                    {"role": "user", "content": text}
                ],
                response_format={"type": "json_object"}
            )
            # Extract and parse the model's JSON answer.
            chatgpt_output = completion.choices[0].message.content
            output_dict = json.loads(chatgpt_output)
            logger.info("Successfully generated sound effect description: %s", output_dict)
            return output_dict

        except json.JSONDecodeError as e:
            logger.error("Failed to parse the output text as JSON: %s", e)
            raise RuntimeError(f"Error: Failed to parse the output text as JSON.\nOutput: {chatgpt_output}")

        except HTTPError as e:
            logger.error("HTTP error occurred: %s", e)
            raise RuntimeError(f"HTTP Error: {e}")

        except Exception as e:
            logger.error("Unexpected error occurred: %s", e)
            raise RuntimeError(f"Unexpected Error: {e}")

    async def generate_parameters_for_sound_effect(self, text: str, generated_audio_file: str = None) -> dict:
        """Generate effect parameters; if an audio file is given, use its real duration.

        NOTE: default added to generated_audio_file for consistency with the
        sync EffectGenerator (backward compatible).
        """
        llm_output = await self.generate_text_for_sound_effect(text)
        if generated_audio_file is not None:
            # Prefer the measured duration of the real audio over the LLM's estimate.
            llm_output['duration_seconds'] = get_audio_duration(generated_audio_file)
            logger.info("Added duration_seconds to output based on generated audio file: %s", generated_audio_file)
        return llm_output

    async def add_emotion_to_text(self, text: str) -> dict:
        """Annotate text with emotional emphasis and TTS voice parameters.

        :raises RuntimeError: if the model's output is not valid JSON.
        """
        completion = await self.client.chat.completions.create(
            model=self.model_type,
            messages=[{"role": "system", "content": self.text_modification_prompt},
                      {"role": "user", "content": text}],
            response_format={"type": "json_object"}
        )
        chatgpt_output = completion.choices[0].message.content
        try:
            output_dict = json.loads(chatgpt_output)
            logger.info("Successfully modified text with emotional cues: %s", output_dict)
            return output_dict
        except json.JSONDecodeError as e:
            logger.error("Error in parsing the modified text: %s", e)
            # BUG FIX: the original raised a bare f-string, which is a TypeError in
            # Python 3 ("exceptions must derive from BaseException").
            raise RuntimeError(f"error, output_text: {chatgpt_output}") from e
148
+
src/emotions/prompts.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# System prompt: describe a sound effect for ElevenLabs, INCLUDING a predicted duration.
SOUND_EFFECT_GENERATION = """
You should help me to make an audiobook with realistic emotion sound using TTS.
You are tasked with generating a description of sound effects
that matches the atmosphere, actions, and tone of a given sentence or text from a book.
The description should be tailored to create a sound effect using ElevenLabs'sound generation API.
The generated sound description must evoke the scene
or emotions from the text (e.g., footsteps, wind, tense silence, etc.),
and it should be succinct and fit the mood of the text.

Additionally, you should include the following parameters in your response:

Text: A generated description of the sound that matches the text provided.
Keep the description simple and effective to capture the soundscape.
This text will be converted into a sound effect.
Duration_seconds: The appropriate duration of the sound effect,
which should be calculated based on the length and nature of the scene.
Cap this duration at 22 seconds. But be carefully, for very long text in input make a long sound effect,
for small make a small one. And the duration should be similar to duration of input text
Prompt_influence: A value between 0 and 1, where a higher value makes the sound generation closely
follow the sound description. For general sound effects (e.g., footsteps, background ambiance),
use a value around 0.3. For more specific or detailed sound scenes
(e.g., thunderstorm, battle sounds), use a higher value like 0.5 to 0.7.

Your output should be in the following JSON format:

{
"text": "A soft breeze rustling through leaves, distant birds chirping.",
"duration_seconds": 4.0,
"prompt_influence": 0.4
}

"""

# System prompt: same as above but WITHOUT duration prediction (duration is
# measured from the generated audio instead).
SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION = """
You should help me to make an audiobook with realistic emotion sound using TTS.
You are tasked with generating a description of sound effects
that matches the atmosphere, actions, and tone of a given sentence or text from a book.
The description should be tailored to create a sound effect using ElevenLabs'sound generation API.
The generated sound description must evoke the scene
or emotions from the text (e.g., footsteps, wind, tense silence, etc.),
and it should be succinct and fit the mood of the text.

Additionally, you should include the following parameters in your response:

Text: A generated description of the sound that matches the text provided.
Keep the description simple and effective to capture the soundscape.
This text will be converted into a sound effect.
Prompt_influence: A value between 0 and 1, where a higher value makes the sound generation closely
follow the sound description. For general sound effects (e.g., footsteps, background ambiance),
use a value around 0.3. For more specific or detailed sound scenes
(e.g., thunderstorm, battle sounds), use a higher value like 0.5 to 0.7.

Your output should be in the following JSON format:

{
"text": "A soft breeze rustling through leaves, distant birds chirping.",
"prompt_influence": 0.4
}

"""

# System prompt: rewrite text with emotional emphasis and return ElevenLabs
# voice-settings parameters (stability / similarity_boost / style).
TEXT_MODIFICATION = """
You should help me to make an audiobook with realistic emotion-based voice using TTS.
You are tasked with adjusting the emotional tone of a given text
by modifying the text with special characters such as "!", "...", "-", "~",
and uppercase words to add emphasis or convey emotion. For adding more emotion u can
duplicate special characters for example "!!!".
Do not remove or add any different words.
Only alter the presentation of the existing words.
After modifying the text, adjust the "stability", "similarity_boost" and "style" parameters
according to the level of emotional intensity in the modified text.
Higher emotional intensity should lower the "stability" and raise the "similarity_boost".
Your output should be in the following JSON format:
{
"modified_text": "Modified text with emotional adjustments.",
"params": {
"stability": 0.7,
"similarity_boost": 0.5,
"style": 0.3
}
}

The "stability" parameter should range from 0 to 1,
with lower values indicating a more expressive, less stable voice.
The "similarity_boost" parameter should also range from 0 to 1,
with higher values indicating more emphasis on the voice similarity.
The "style" parameter should also range from 0 to 1,
where lower values indicate a neutral tone and higher values reflect more stylized or emotional delivery.
Adjust both according to the emotional intensity of the text.

Example of text that could be passed:

Text: "I can't believe this is happening."
"""
src/emotions/utils.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydub import AudioSegment
2
+ from pathlib import Path
3
+ from elevenlabs import ElevenLabs, AsyncElevenLabs
4
+ from elevenlabs import play, save
5
+
6
+ from src.config import logger
7
+
8
+
9
def get_audio_duration(filepath: str) -> float:
    """
    Returns the duration of the audio file in seconds.

    :param filepath: Path to the audio file.
    :return: Duration of the audio file in seconds, rounded to one decimal place.
    """
    segment = AudioSegment.from_file(filepath)
    # pydub reports length in milliseconds; convert to seconds.
    return round(len(segment) / 1000, 1)
19
+
20
+
21
def add_overlay_for_audio(main_audio_filename: str,
                          sound_effect_filename: str,
                          output_filename: str = None,
                          cycling_effect: bool = True,
                          decrease_effect_volume: int = 0) -> str:
    """Overlay a sound effect onto a main audio track and export the mix as WAV.

    :param main_audio_filename: path to the main (narration) audio.
    :param sound_effect_filename: path to the sound-effect audio.
    :param output_filename: output WAV path; derived from the inputs when None.
    :param cycling_effect: loop the effect until it covers the whole main track.
    :param decrease_effect_volume: dB to attenuate the effect by (ignored if <= 0).
    :return: the path the combined audio was written to.
    :raises RuntimeError: if either input file cannot be loaded.
    """
    try:
        base_track = AudioSegment.from_file(main_audio_filename)
        effect_track = AudioSegment.from_file(sound_effect_filename)
    except Exception as e:
        raise RuntimeError(f"Error loading audio files: {e}")

    if cycling_effect:
        # Keep doubling the effect until it is at least as long as the base track.
        while len(effect_track) < len(base_track):
            effect_track += effect_track

    # Never let the effect outlast the main audio.
    effect_track = effect_track[:len(base_track)]

    if decrease_effect_volume > 0:
        # pydub subtraction attenuates by the given number of decibels.
        effect_track = effect_track - decrease_effect_volume
    mixed = base_track.overlay(effect_track)

    if output_filename is None:
        output_filename = f"{Path(main_audio_filename).stem}_{Path(sound_effect_filename).stem}.wav"
    mixed.export(output_filename, format="wav")
    return output_filename
46
+
47
+
48
def sound_generation(sound_generation_data: dict, output_file: str, api_key: str = "YOUR_API_KEY"):
    """Generate a sound effect via the ElevenLabs API and save it to a file.

    :param sound_generation_data: dict with 'text', 'duration_seconds' and
        'prompt_influence' keys (as produced by EffectGenerator).
    :param output_file: path to write the generated audio to.
    :param api_key: ElevenLabs API key. Generalized from the previously
        hard-coded placeholder; pass a real key explicitly.
    """
    client = ElevenLabs(
        api_key=api_key,
    )
    audio = client.text_to_sound_effects.convert(
        text=sound_generation_data['text'],
        duration_seconds=sound_generation_data['duration_seconds'],
        prompt_influence=sound_generation_data['prompt_influence'],
    )
    save(audio, output_file)
    # BUG FIX: success message was logged at ERROR level.
    logger.info("Successfully generated sound effect to file: %s", output_file)
59
+
60
async def sound_generation_async(sound_generation_data: dict, output_file: str, api_key: str = "YOUR_API_KEY"):
    """Asynchronously generate a sound effect via ElevenLabs and save it to a file.

    :param sound_generation_data: dict with 'text', 'duration_seconds' and
        'prompt_influence' keys (as produced by EffectGeneratorAsync).
    :param output_file: path to write the generated audio to.
    :param api_key: ElevenLabs API key. Generalized from the previously
        hard-coded placeholder; pass a real key explicitly.
    """
    client = AsyncElevenLabs(
        api_key=api_key,
    )
    audio = await client.text_to_sound_effects.convert(
        text=sound_generation_data['text'],
        duration_seconds=sound_generation_data['duration_seconds'],
        prompt_influence=sound_generation_data['prompt_influence'],
    )
    save(audio, output_file)
    # BUG FIX: success message was logged at ERROR level.
    logger.info("Successfully generated sound effect to file: %s", output_file)