Merge pull request #1 from navalnica/feature/sound-and-emetion-generation
Files changed:
- src/emotions/generation.py +148 -0
- src/emotions/prompts.py +94 -0
- src/emotions/utils.py +70 -0
src/emotions/generation.py
ADDED
@@ -0,0 +1,148 @@
import json
from abc import ABC, abstractmethod

import openai
from requests import HTTPError

from .prompts import SOUND_EFFECT_GENERATION, SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION, TEXT_MODIFICATION
from .utils import get_audio_duration
from src.config import logger


class AbstractEffectGenerator(ABC):
    @abstractmethod
    def generate_text_for_sound_effect(self, text: str) -> dict:
        pass

    @abstractmethod
    def generate_parameters_for_sound_effect(self, text: str, generated_audio_file: str) -> dict:
        pass

    @abstractmethod
    def add_emotion_to_text(self, text: str) -> dict:
        pass


class EffectGenerator(AbstractEffectGenerator):
    def __init__(self, api_key: str, predict_duration: bool = True, model_type: str = 'gpt-4o'):
        self.client = openai.OpenAI(api_key=api_key)
        self.sound_effect_prompt = (
            SOUND_EFFECT_GENERATION
            if predict_duration
            else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
        )
        self.text_modification_prompt = TEXT_MODIFICATION
        self.model_type = model_type
        logger.info(f"EffectGenerator initialized with model_type: {model_type}, predict_duration: {predict_duration}")

    def generate_text_for_sound_effect(self, text: str) -> dict:
        """Generate a sound effect description and parameters based on the input text."""
        try:
            completion = self.client.chat.completions.create(
                model=self.model_type,
                messages=[
                    {"role": "system", "content": self.sound_effect_prompt},
                    {"role": "user", "content": text}
                ],
                response_format={"type": "json_object"}
            )
            # Extract the model output.
            chatgpt_output = completion.choices[0].message.content

            # Parse and return the JSON response.
            output_dict = json.loads(chatgpt_output)
            logger.info("Successfully generated sound effect description: %s", output_dict)
            return output_dict

        except json.JSONDecodeError as e:
            logger.error("Failed to parse the output text as JSON: %s", e)
            raise RuntimeError(f"Error: Failed to parse the output text as JSON.\nOutput: {chatgpt_output}")

        except HTTPError as e:
            logger.error("HTTP error occurred: %s", e)
            raise RuntimeError(f"HTTP Error: {e}")

        except Exception as e:
            logger.error("Unexpected error occurred: %s", e)
            raise RuntimeError(f"Unexpected Error: {e}")

    def generate_parameters_for_sound_effect(self, text: str, generated_audio_file: str = None) -> dict:
        llm_output = self.generate_text_for_sound_effect(text)
        if generated_audio_file is not None:
            # Replace the predicted duration with the actual duration of the generated audio.
            llm_output['duration_seconds'] = get_audio_duration(generated_audio_file)
            logger.info("Added duration_seconds to output based on generated audio file: %s", generated_audio_file)
        return llm_output

    def add_emotion_to_text(self, text: str) -> dict:
        completion = self.client.chat.completions.create(
            model=self.model_type,
            messages=[
                {"role": "system", "content": self.text_modification_prompt},
                {"role": "user", "content": text}
            ],
            response_format={"type": "json_object"}
        )
        chatgpt_output = completion.choices[0].message.content
        try:
            output_dict = json.loads(chatgpt_output)
            logger.info("Successfully modified text with emotional cues: %s", output_dict)
            return output_dict
        except json.JSONDecodeError as e:
            logger.error("Error in parsing the modified text: %s", e)
            raise RuntimeError(f"Failed to parse the modified text as JSON.\nOutput: {chatgpt_output}")


class EffectGeneratorAsync(AbstractEffectGenerator):
    def __init__(self, api_key: str, predict_duration: bool = True, model_type: str = 'gpt-4o'):
        self.client = openai.AsyncOpenAI(api_key=api_key)
        self.sound_effect_prompt = (
            SOUND_EFFECT_GENERATION
            if predict_duration
            else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
        )
        self.text_modification_prompt = TEXT_MODIFICATION
        self.model_type = model_type

    async def generate_text_for_sound_effect(self, text: str) -> dict:
        """Asynchronous version of generate_text_for_sound_effect."""
        try:
            completion = await self.client.chat.completions.create(
                model=self.model_type,
                messages=[
                    {"role": "system", "content": self.sound_effect_prompt},
                    {"role": "user", "content": text}
                ],
                response_format={"type": "json_object"}
            )
            # Extract the model output.
            chatgpt_output = completion.choices[0].message.content

            # Parse and return the JSON response.
            output_dict = json.loads(chatgpt_output)
            logger.info("Successfully generated sound effect description: %s", output_dict)
            return output_dict

        except json.JSONDecodeError as e:
            logger.error("Failed to parse the output text as JSON: %s", e)
            raise RuntimeError(f"Error: Failed to parse the output text as JSON.\nOutput: {chatgpt_output}")

        except HTTPError as e:
            logger.error("HTTP error occurred: %s", e)
            raise RuntimeError(f"HTTP Error: {e}")

        except Exception as e:
            logger.error("Unexpected error occurred: %s", e)
            raise RuntimeError(f"Unexpected Error: {e}")

    async def generate_parameters_for_sound_effect(self, text: str, generated_audio_file: str = None) -> dict:
        llm_output = await self.generate_text_for_sound_effect(text)
        if generated_audio_file is not None:
            # Replace the predicted duration with the actual duration of the generated audio.
            llm_output['duration_seconds'] = get_audio_duration(generated_audio_file)
            logger.info("Added duration_seconds to output based on generated audio file: %s", generated_audio_file)
        return llm_output

    async def add_emotion_to_text(self, text: str) -> dict:
        completion = await self.client.chat.completions.create(
            model=self.model_type,
            messages=[
                {"role": "system", "content": self.text_modification_prompt},
                {"role": "user", "content": text}
            ],
            response_format={"type": "json_object"}
        )
        chatgpt_output = completion.choices[0].message.content
        try:
            output_dict = json.loads(chatgpt_output)
            logger.info("Successfully modified text with emotional cues: %s", output_dict)
            return output_dict
        except json.JSONDecodeError as e:
            logger.error("Error in parsing the modified text: %s", e)
            raise RuntimeError(f"Failed to parse the modified text as JSON.\nOutput: {chatgpt_output}")
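For reference, a minimal usage sketch for the classes above (not part of the diff; it assumes a valid OpenAI API key in the environment, and the sample passages are made up):

    import os

    from src.emotions.generation import EffectGenerator

    generator = EffectGenerator(api_key=os.environ["OPENAI_API_KEY"], predict_duration=True)

    # Returns a dict with keys like "text", "duration_seconds", "prompt_influence",
    # matching the JSON format requested in src/emotions/prompts.py below.
    effect_params = generator.generate_text_for_sound_effect("The door creaked open into the dark hallway.")

    # Returns {"modified_text": ..., "params": {"stability": ..., "similarity_boost": ..., "style": ...}}.
    emotional = generator.add_emotion_to_text("I can't believe this is happening.")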
src/emotions/prompts.py
ADDED
@@ -0,0 +1,94 @@
SOUND_EFFECT_GENERATION = """
You should help me to make an audiobook with realistic emotional sounds using TTS.
You are tasked with generating a description of sound effects
that matches the atmosphere, actions, and tone of a given sentence or text from a book.
The description should be tailored to create a sound effect using ElevenLabs' sound generation API.
The generated sound description must evoke the scene
or emotions from the text (e.g., footsteps, wind, tense silence, etc.),
and it should be succinct and fit the mood of the text.

Additionally, you should include the following parameters in your response:

Text: A generated description of the sound that matches the text provided.
Keep the description simple and effective to capture the soundscape.
This text will be converted into a sound effect.
Duration_seconds: The appropriate duration of the sound effect,
which should be calculated based on the length and nature of the scene.
Cap this duration at 22 seconds, and keep it proportional to the input:
a very long input text should get a long sound effect, a short one a short effect,
so that the effect's duration roughly matches the duration of the input text.
Prompt_influence: A value between 0 and 1, where a higher value makes the sound generation closely
follow the sound description. For general sound effects (e.g., footsteps, background ambiance),
use a value around 0.3. For more specific or detailed sound scenes
(e.g., thunderstorm, battle sounds), use a higher value like 0.5 to 0.7.

Your output should be in the following JSON format:

{
  "text": "A soft breeze rustling through leaves, distant birds chirping.",
  "duration_seconds": 4.0,
  "prompt_influence": 0.4
}

"""

SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION = """
You should help me to make an audiobook with realistic emotional sounds using TTS.
You are tasked with generating a description of sound effects
that matches the atmosphere, actions, and tone of a given sentence or text from a book.
The description should be tailored to create a sound effect using ElevenLabs' sound generation API.
The generated sound description must evoke the scene
or emotions from the text (e.g., footsteps, wind, tense silence, etc.),
and it should be succinct and fit the mood of the text.

Additionally, you should include the following parameters in your response:

Text: A generated description of the sound that matches the text provided.
Keep the description simple and effective to capture the soundscape.
This text will be converted into a sound effect.
Prompt_influence: A value between 0 and 1, where a higher value makes the sound generation closely
follow the sound description. For general sound effects (e.g., footsteps, background ambiance),
use a value around 0.3. For more specific or detailed sound scenes
(e.g., thunderstorm, battle sounds), use a higher value like 0.5 to 0.7.

Your output should be in the following JSON format:

{
  "text": "A soft breeze rustling through leaves, distant birds chirping.",
  "prompt_influence": 0.4
}

"""

TEXT_MODIFICATION = """
You should help me to make an audiobook with a realistic emotion-based voice using TTS.
You are tasked with adjusting the emotional tone of a given text
by modifying the text with special characters such as "!", "...", "-", "~",
and uppercase words to add emphasis or convey emotion. To add more emotion, you can
duplicate special characters, for example "!!!".
Do not remove or add any different words.
Only alter the presentation of the existing words.
After modifying the text, adjust the "stability", "similarity_boost" and "style" parameters
according to the level of emotional intensity in the modified text.
Higher emotional intensity should lower the "stability" and raise the "similarity_boost".
Your output should be in the following JSON format:
{
  "modified_text": "Modified text with emotional adjustments.",
  "params": {
    "stability": 0.7,
    "similarity_boost": 0.5,
    "style": 0.3
  }
}

The "stability" parameter should range from 0 to 1,
with lower values indicating a more expressive, less stable voice.
The "similarity_boost" parameter should also range from 0 to 1,
with higher values indicating more emphasis on the voice similarity.
The "style" parameter should also range from 0 to 1,
where lower values indicate a neutral tone and higher values reflect more stylized or emotional delivery.
Adjust all three according to the emotional intensity of the text.

Example of text that could be passed:

Text: "I can't believe this is happening."
"""
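For context (again not part of the diff), a sketch of how the "params" returned by the TEXT_MODIFICATION prompt could be mapped onto ElevenLabs voice settings; VoiceSettings is an elevenlabs SDK type, but this wiring is an assumption, not code from this PR:

    from elevenlabs import VoiceSettings

    # A parsed TEXT_MODIFICATION response, e.g. as returned by add_emotion_to_text().
    output = {
        "modified_text": "I CAN'T believe this is happening!!!",
        "params": {"stability": 0.3, "similarity_boost": 0.8, "style": 0.6},
    }

    # Map the suggested params onto the SDK's voice settings (hypothetical wiring).
    settings = VoiceSettings(
        stability=output["params"]["stability"],
        similarity_boost=output["params"]["similarity_boost"],
        style=output["params"]["style"],
    )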
src/emotions/utils.py
ADDED
@@ -0,0 +1,70 @@
from pathlib import Path

from pydub import AudioSegment
from elevenlabs import ElevenLabs, AsyncElevenLabs
from elevenlabs import save

from src.config import logger


def get_audio_duration(filepath: str) -> float:
    """
    Return the duration of the audio file in seconds.

    :param filepath: Path to the audio file.
    :return: Duration of the audio file in seconds, rounded to one decimal place.
    """
    audio = AudioSegment.from_file(filepath)
    duration_in_seconds = len(audio) / 1000  # pydub lengths are in milliseconds
    return round(duration_in_seconds, 1)


def add_overlay_for_audio(main_audio_filename: str,
                          sound_effect_filename: str,
                          output_filename: str = None,
                          cycling_effect: bool = True,
                          decrease_effect_volume: int = 0) -> str:
    """Overlay a sound effect onto the main audio track and export the result as WAV."""
    try:
        main_audio = AudioSegment.from_file(main_audio_filename)
        effect_audio = AudioSegment.from_file(sound_effect_filename)
    except Exception as e:
        raise RuntimeError(f"Error loading audio files: {e}")

    if cycling_effect:
        # Loop the effect until it is at least as long as the main audio.
        while len(effect_audio) < len(main_audio):
            effect_audio += effect_audio

    # Trim the effect to the length of the main audio.
    effect_audio = effect_audio[:len(main_audio)]

    if decrease_effect_volume > 0:
        # pydub subtraction lowers the volume by the given number of dB.
        effect_audio = effect_audio - decrease_effect_volume
    combined_audio = main_audio.overlay(effect_audio)

    if output_filename is None:
        output_filename = f"{Path(main_audio_filename).stem}_{Path(sound_effect_filename).stem}.wav"
    combined_audio.export(output_filename, format="wav")
    return output_filename


def sound_generation(sound_generation_data: dict, output_file: str):
    client = ElevenLabs(
        api_key="YOUR_API_KEY",
    )
    audio = client.text_to_sound_effects.convert(
        text=sound_generation_data['text'],
        duration_seconds=sound_generation_data['duration_seconds'],
        prompt_influence=sound_generation_data['prompt_influence'],
    )
    save(audio, output_file)
    logger.info("Successfully generated sound effect to file: %s", output_file)


async def sound_generation_async(sound_generation_data: dict, output_file: str):
    client = AsyncElevenLabs(
        api_key="YOUR_API_KEY",
    )
    audio = await client.text_to_sound_effects.convert(
        text=sound_generation_data['text'],
        duration_seconds=sound_generation_data['duration_seconds'],
        prompt_influence=sound_generation_data['prompt_influence'],
    )
    save(audio, output_file)
    logger.info("Successfully generated sound effect to file: %s", output_file)
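A short end-to-end sketch tying the helpers together (not part of the diff; all filenames are hypothetical):

    from src.emotions.utils import sound_generation, add_overlay_for_audio

    # Parameters in the shape produced by generate_parameters_for_sound_effect().
    params = {
        "text": "A soft breeze rustling through leaves, distant birds chirping.",
        "duration_seconds": 4.0,
        "prompt_influence": 0.4,
    }

    sound_generation(params, "effect.wav")

    # Loop the effect under the narration track and lower its volume by 6 dB.
    mixed_file = add_overlay_for_audio(
        main_audio_filename="narration.wav",
        sound_effect_filename="effect.wav",
        decrease_effect_volume=6,
    )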