Spaces:

ales
/

ai-audio-books

Running

App Files Files Community

navalnica committed on Oct 11, 2024

Commit

95849c2

1 Parent(s): 8797a8a

stability: use semaphore for tts text LLM preprocessing; retry tts_astream;

Browse files

Files changed (11) hide show

.gitignore +1 -0
README.md +22 -8
data/11labs_available_tts_voices.reviewed.csv +35 -0
data/samples_to_split.py +142 -0
src/audio_generators.py +53 -27
src/builder.py +2 -4
src/config.py +4 -0
src/emotions/generation.py +102 -93
src/select_voice_chain.py +11 -2
src/tts.py +9 -0
src/utils.py +2 -2

.gitignore CHANGED Viewed

@@ -5,5 +5,6 @@ venv
 .DS_Store
 data/books
 .env

 .DS_Store
 data/books
+data/audiobooks
 .env

README.md CHANGED Viewed

@@ -11,7 +11,9 @@ python_version: 3.11
 ---
 ### Action items
-- check new set of voices
 - intonations
     - add context
 - audio effects
@@ -19,14 +21,26 @@ python_version: 3.11
     - filter, apply only for long phrases
     - only for narrator?
     - checkbox! make effects great again (no) optional
-- limit on input text size
-- fix UI
-- slides / story
 - optimizations
     - combine sequential phrases of same character in single phrase
     - support large texts. use batching. problem: how to ensure same characters?
-- can detect characters in first prompt, then split text in each batch into character phrases
-    - probably split large phrases into smaller ones
-    - identify unknown characters
-    - use LLM to recognize characters for a given text and provide descriptions detailed enough to select appropriate voice

 ---
 ### Action items
+- voices
+    - filter to use only best voices
 - intonations
     - add context
 - audio effects
     - filter, apply only for long phrases
     - only for narrator?
     - checkbox! make effects great again (no) optional
+- stability
+    - add limit on input text size (5000 chars)
+- improve UI
+    - add error box
+    - add samples
+    - show character parts
+    - remove file upload pane
+    - labels on how long to wait
+    - labels describing components
+    - header and description
+- prepare slides / story
+- testing
+    - evaluate current execution time
+    - test on different text inputs
 - optimizations
+    - generate audio effects asynchronously
     - combine sequential phrases of same character in single phrase
     - support large texts. use batching. problem: how to ensure same characters?
+    - can detect characters in first prompt, then split text in each batch into character phrases
+        - probably split large phrases into smaller ones
+        - identify unknown characters
+        - use LLM to recognize characters for a given text and provide descriptions detailed enough to select appropriate voice

data/11labs_available_tts_voices.reviewed.csv ADDED Viewed

	@@ -0,0 +1,35 @@

+voice_id,name,preview_url,manual_quality_review,owner_id,permission_on_resource,is_legacy,is_mixed,accent,description,age,gender,category,language,descriptive
+8opUN7sGOKbyojnjvNdl,Angela,https://storage.googleapis.com/eleven-public-prod/KI49vYhSgygxHVkjNTjMnJEaS6H2/voices/V8OwijRGPjSNuaZo64vg/6923fbb2-a713-4d42-a590-b8999b74f644.mp3,ok,,admin,FALSE,FALSE,american,,middle_aged,female,conversational,,confident
+yu4eXTP5aod8KAQzTI3T,"Claudia - Credible, Competent & Authentic",https://storage.googleapis.com/eleven-public-prod/database/user/Db6dCWW3pFVb0DPq7Q5s86VmuNQ2/voices/yu4eXTP5aod8KAQzTI3T/8zgX5ljCJ8kmBL0Q8rx7.mp3,ok,,admin,FALSE,FALSE,american,,middle_aged,female,conversational,en,confident
+wL9XWq1lRS2ZxgYArGOm,Freya - Concise and Wellspoken,https://storage.googleapis.com/eleven-public-prod/ofJ3ST4rvZcJUjpc7t5B7gqcCz93/voices/eWgG9MAbD9umBchcWP1B/cebbd3e7-2460-43d7-8ae3-d20160bc3e44.mp3,medium,,admin,FALSE,FALSE,american,,middle_aged,female,conversational,,formal
+GUM0hIboeFNpFw8Le7kY,Amber,https://storage.googleapis.com/eleven-public-prod/Eq8tyIYOx5PoyXO0aMbKV4xG1FW2/voices/XX9E8l5jsdMf8kszvsyX/b8a2aa2c-e3dc-42d0-b943-a2eb5d760315.mp3,bad,,admin,FALSE,FALSE,american,,middle_aged,female,conversational,,calm
+MS8BX0O8omm5Ie6mL8v2,Kay - 65 years old,https://storage.googleapis.com/eleven-public-prod/8UfhnmuNoPSOxBcoJqW3NsegvUG2/voices/r8SplNeU9vfxsNrJFstn/823adabf-a99a-4537-ac95-90e2ee315d2d.mp3,ok,,admin,FALSE,FALSE,british,,old,female,conversational,,calm
+NNYB2AxILPYTs53mGYij,Jenn,https://storage.googleapis.com/eleven-public-prod/63YWf9FcbyS5PpTayiUUxnQ95Cj2/voices/QxTS99N2QuIZNwTkGB5M/518786bf-ee01-400e-a690-c75193fee06a.mp3,bad,,admin,FALSE,FALSE,american,,old,female,conversational,,serious
+RigKMMlryPP9jHT50Jzl,Sybil,https://storage.googleapis.com/eleven-public-prod/LPSMwp0QPIPEu76c5UE6a2Tg6kU2/voices/bRp2SymzfuLvibautVoX/eb9ad599-df42-4447-8d85-b19ac1333018.mp3,bad,,admin,FALSE,FALSE,australian,,old,female,conversational,,casual
+v921MVi0ixhdc1CW2WxF,Aspexia - Grand & Clear,https://storage.googleapis.com/eleven-public-prod/lA5gT7FFPLVt66XJyVjpVR1ljPA2/voices/ZV3F48j5cMfp5ZlnpCs4/eb367cb4-5e11-4cbd-8fbe-c417bf21ffd3.mp3,ok,,admin,FALSE,FALSE,american,,old,female,conversational,,casual
+yxHHj4hKJKKTyvUpnbds,Heather - Mature British lady,https://storage.googleapis.com/eleven-public-prod/28U448fBuufmkz5lMBQX6JLm8sw2/voices/f3SgjPzocHWjSn3VXeKy/c9a06d6e-50fa-4ce3-9e7f-f6fd8c252898.mp3,ok,,admin,FALSE,FALSE,british,,old,female,conversational,,mature
+cgSgspJ2msm6clMCkdW9,Jessica,https://storage.googleapis.com/eleven-public-prod/premade/voices/cgSgspJ2msm6clMCkdW9/56a97bf8-b69b-448f-846c-c3a11683d45a.mp3,ok,,,FALSE,FALSE,american,expressive,young,female,conversational,,
+1btZhL2wthuOhUqvI0bB,Emily  - pleasant teen voice,https://storage.googleapis.com/eleven-public-prod/opnveFdsOUSqSDvVg1e9RtN4lWG2/voices/tZ7CAYevCBDjnoUrMFli/430cc09c-257d-41cd-acdd-5177e05a193a.mp3,ok,,admin,FALSE,FALSE,british,,young,female,conversational,,pleasant
+K5DRk4s8l1HFKsggS25u,Adrianna,https://storage.googleapis.com/eleven-public-prod/63YWf9FcbyS5PpTayiUUxnQ95Cj2/voices/ZWxdaecYtNJxT2ist5K7/201af84a-5d00-4c9f-999b-625bb8dab137.mp3,ok,,admin,FALSE,FALSE,australian,,young,female,conversational,,pleasant
+PoHUWWWMHFrA8z7Q88pu,Miranda,https://storage.googleapis.com/eleven-public-prod/ZXqeDajThsaYR96DSywtY94M8Dy1/voices/PoHUWWWMHFrA8z7Q88pu/403a094a-c4e4-431f-b96a-7c63ae76508d.mp3,ok,,admin,FALSE,FALSE,american,,young,female,conversational,en,cute
+Qo4wB0XcrmHFXFyP38mc,Sasha - Soothing and Chill,https://storage.googleapis.com/eleven-public-prod/u9529nqLipaEPA53CKpxKF7nqLH2/voices/mnp8rIZc6Kiem1xaJDaM/95b33e4a-e41b-4fda-9eb8-cab3039bce60.mp3,ok,,admin,FALSE,FALSE,american,,young,female,conversational,,chill
+U0gWepDCHKmmexbBxeSX,Alicia - Black Female,https://storage.googleapis.com/eleven-public-prod/laura1cGHOTJh1Ah99tjAoTLy7z1/voices/NBcGlQxeT5lFe7hgzwTR/f69f1f70-26a0-403c-915d-1992b6d8cfd0.mp3,ok,,admin,FALSE,FALSE,american,,young,female,conversational,,crisp
+rfkTsdZrVWEVhDycUYn9,Shelby,https://storage.googleapis.com/eleven-public-prod/database/user/amWos2WP7hRs0yPBtA0OwjuzHAH2/voices/rfkTsdZrVWEVhDycUYn9/iiFLm9bEQ83kiP0bExhG.mp3,ok,,admin,FALSE,FALSE,british,,young,female,conversational,en,pleasant
+teAOBFSeynXfbyNgq6Ec,Ally - Curious and Chill,https://storage.googleapis.com/eleven-public-prod/ofJ3ST4rvZcJUjpc7t5B7gqcCz93/voices/mW6Z7SZeRqOvuzCTgxyk/2132b3d1-fdcf-4036-85c9-bdd003941130.mp3,bad,,admin,FALSE,FALSE,american,,young,female,conversational,,chill
+IKne3meq5aSn9XLyUdCD,Charlie,https://storage.googleapis.com/eleven-public-prod/premade/voices/IKne3meq5aSn9XLyUdCD/102de6f2-22ed-43e0-a1f1-111fa75c5481.mp3,ok,,,FALSE,FALSE,australian,natural,middle_aged,male,conversational,,
+cjVigY5qzO86Huf0OWal,Eric,https://storage.googleapis.com/eleven-public-prod/premade/voices/cjVigY5qzO86Huf0OWal/d098fda0-6456-4030-b3d8-63aa048c9070.mp3,medium,,,FALSE,FALSE,american,friendly,middle_aged,male,conversational,,
+BFUk567oZITYKwOqegEq,Riley - loud and intense,https://storage.googleapis.com/eleven-public-prod/UwDtqCF44YaL77wxb8DVQlHT5Gp1/voices/60G0VdAP3WBQQbE6tSkT/ecc00def-2543-4b50-b93d-5d4b6c7dca33.mp3,very bad,,admin,FALSE,FALSE,american,,middle_aged,male,conversational,,intense
+EkuRA6XL9UbflTWEtNbQ,Middle age Southern Male,https://storage.googleapis.com/eleven-public-prod/0gh9bWjaVmNOvQJVcRddxeYIS2z1/voices/t5Oo3tZSuEZt6BD2VGV4/5c0177c5-46bd-414c-abfd-6cd6d5677f08.mp3,medium,,admin,FALSE,FALSE,american,,middle_aged,male,conversational,,casual
+MP7UPhn7eVWqCGJGIh6Q,Aaron Patrick - Fun-Upbeat,https://storage.googleapis.com/eleven-public-prod/database/user/ktIm5hvnGlc2TVlwOiZmbmw9kHy2/voices/MP7UPhn7eVWqCGJGIh6Q/NFiMZncqQJ0IFTzFGbwQ.mp3,ok,,admin,FALSE,FALSE,american,,middle_aged,male,conversational,en,upbeat
+RPEIZnKMqlQiZyZd1Dae,Christopher - friendly guy next door,https://storage.googleapis.com/eleven-public-prod/database/user/HURZYaLa4shZEqiT75qd5tyEsSr1/voices/RPEIZnKMqlQiZyZd1Dae/FwLtZ4mCBHV0eLjbUM8Y.mp3,ok,,admin,FALSE,FALSE,american,,middle_aged,male,conversational,en,casual
+Tx7VLgfksXHVnoY6jDGU,"Conversational Joe - A chatty casual voice, British RP male",https://storage.googleapis.com/eleven-public-prod/database/user/wf6Rmje05ZbqeHYfK82ThsPKouC2/voices/Tx7VLgfksXHVnoY6jDGU/ab4X4F9RcNSeTwBS8KS9.mp3,ok,,admin,FALSE,FALSE,british,,middle_aged,male,conversational,en,casual
+WLKp2jV6nrS8aMkPPDRO,Paul - Australian Professional Presenter,https://storage.googleapis.com/eleven-public-prod/custom/voices/WLKp2jV6nrS8aMkPPDRO/eaTiwieru6XA3d8ocVFK.mp3,ok,,admin,FALSE,FALSE,australian,,middle_aged,male,conversational,en,professional
+r5Al4oLBZaQ3SY9ncbVG,"Kass - Energetic, Casual, Engaging",https://storage.googleapis.com/eleven-public-prod/R51RaBhjNqbqLhP6cvEePomgTZX2/voices/r5Al4oLBZaQ3SY9ncbVG/18422b0c-f056-467c-997e-6a13854baea6.mp3,ok,,admin,FALSE,FALSE,american,,middle_aged,male,conversational,en,excited
+NWQswDS4aKfvBwYabvGT,Călin OZ,https://storage.googleapis.com/eleven-public-prod/FJwaL91a8SZIMeJz4Nx1ea8F1N63/voices/fi2r0fw0bQUShZBKDyrI/ba51edb4-2776-410b-aee9-597a25ec7492.mp3,bad,,admin,FALSE,FALSE,american,,old,male,conversational,,wise
+QT02s1mEPA02kfid1nTk,"Donald - American, 70 years old",https://storage.googleapis.com/eleven-public-prod/8UfhnmuNoPSOxBcoJqW3NsegvUG2/voices/MIer4gjk6bcEMGSRRkOs/06d81bc4-5cfa-4f46-b269-6cb655a45ad9.mp3,ok,,admin,FALSE,FALSE,american,,old,male,conversational,,intense
+eAyDL3LKBhCxNF21PbuQ,Mike - teacher,https://storage.googleapis.com/eleven-public-prod/ZNTWuC7IYQQWqzzeKbpLS0FeplI3/voices/Au1xn1q4XOiitEswfIzo/56b03843-3723-4617-a486-bbedb0f84828.mp3,bad,,admin,FALSE,FALSE,american,,old,male,conversational,,pleasant
+fQpAewJVtu0DoZkQ58bW,Old man with a soft voice,https://storage.googleapis.com/eleven-public-prod/uRcoDhI6DVf8aT39RswuDWK2C0P2/voices/xDBqezbIeYil8jkpc3c0/d2771c6e-0383-4050-bfa8-84a963baa098.mp3,bad,,admin,FALSE,FALSE,american,,old,male,conversational,,calm
+z3CSZSXDkv7fvwqfOrZN,Bryan,https://storage.googleapis.com/eleven-public-prod/Y3yb5Fdg8aVDi9n3MLjYB1v0tMF2/voices/0HpbeTZr9DdE1r0eGVn1/4c4cd4a7-c2ea-45d3-b402-55fc875240e0.mp3,medium,,admin,FALSE,FALSE,american,,old,male,conversational,,confident
+1RVpBInY9YUYMLSUQReV,Cody McAvoy,https://storage.googleapis.com/eleven-public-prod/EzfotYSdpJdL97IDcxmmjJAFhrB3/voices/1RVpBInY9YUYMLSUQReV/f4807088-b6ff-48fa-83ca-a060fe724d10.mp3,ok,,admin,FALSE,FALSE,american,,young,male,conversational,en,pleasant
+IFtWI8YHaBY8I7EB217u,ADAM,https://storage.googleapis.com/eleven-public-prod/21EHOpCpDMOQhKNMRI94rNioulW2/voices/4KEs1SFpz8x6fuT7MB5B/e0992f5a-2528-4a2a-981c-0470ccb0e935.mp3,bad,,admin,FALSE,FALSE,american,,young,male,conversational,,pleasant
+xSI29a9HAKdsWv3idXSN,Ali,https://storage.googleapis.com/eleven-public-prod/Je2OUP5SdgSZhG40J7umHz4bhXB3/voices/IlNTfQEoyv1KrV6Jbb0c/a4ba8772-1400-4910-adee-9e0dd65e2b74.mp3,ok,,admin,FALSE,FALSE,american,,young,male,conversational,,intense

data/samples_to_split.py CHANGED Viewed

@@ -96,3 +96,145 @@ up on us and first thing you know—”
 “Don’t believe everything you hear, Nick,” he advised me.
 """

 “Don’t believe everything you hear, Nick,” he advised me.
 """
+WONDERFUL_CHRISTMAS_1 = """\
+“Did the "Snow" go down? Who knows?” she asked, sobbing and crying, and
+quite broken down by hearing the fact put into words.
+“I’m sorry we said a word,” said Kate, crying with Mrs. Dobson; and,
+dropping down upon a footstool, she laid her head in Mrs. Dobson’s lap.
+“You will please forgive us, won’t you?” spoke up the boy in the bed.
+“Forgive you! Yes, indeed. I’m glad you thought enough about my
+poor—about Captain Dobson to talk of him at all, and to-day, too! Why, I
+always go down the harbor to-day; it is just thirty-seven years ago
+to-day since we were married and the ship went on its voyage.”
+“And you will go this afternoon, won’t you?” questioned Kate.
+“I mustn’t,” she said simply.
+“Why not?”
+“Why, I’ve got a little boy of my own to look after to-day, don’t you
+see?” she said, suddenly smiling, with an uplifted face.
+"""
+# sample with lots of narrator speech (long narration paragraphs, only a few dialogue lines)
+WONDERFUL_CHRISTMAS_2 = """\
+Harry Cornwall ran out from the tent to watch the flight of Frank
+Hallock, who ran, frightened by the tiger, and as the boy disappeared
+from sight over the hilltop, Harry determined to try and find time
+enough that very evening to run up to the big house and tell all he knew
+about the necklace that somebody had tucked into the pocket of the
+waistcoat that fell to Jack Flibbit after the great fire. Harry could
+not persuade himself that two little girls in the land had put two
+necklaces, with the same mark, into pockets to go “Out West.” Harry’s
+name was called in a loud tone, and he ran to obey the call at about the
+same moment that Frank Hallock reached the cornfield and picked up his
+hoe.
+Frank counted the hills and the rows, and scarcely looked up until the
+sound of the one o’clock train, on the New Haven Railroad, passing
+through the town, told to him how short the hour had been.
+Then Neptune came home. Frank knew that Kate was on the carriage-road
+that ran past the field, but he would not look up, not even when he
+heard her cheery call to him; so Neptune and Kate went on their way, and
+presently the welcome sound of the dinner-horn was heard.
+Frank did not throw down his hoe, but fell to work harder than ever. In
+five minutes’ time the horn was sounded again, and on looking up, Frank
+beheld Kate standing on the veranda—she was waving her hat to attract
+his attention. He was inclined to make a martyr of himself just then, so
+he waved his straw hat in return, and immediately resolved to “hoe away
+like a major.”
+In fifteen minutes more Kate was making her way for the second time that
+day through Frank’s cornfield. As she drew near, she called out, “Frank!
+Frank! why in the world don’t you come to dinner? There is a gentleman
+at table who came to see papa on business, and I ran away after the
+soup—I couldn’t eat my dinner one bit, without you.”
+“You’ll _have_ to, I reckon,” returned Frank; “a poor fellow, like me,
+who has to hoe corn all day, can’t stop to eat.”
+“O, Frank Hallock! _for shame!_” cried Kate, putting down her indignant
+foot without being able to make noise enough about it to disturb an
+earthworm.
+“It’s _true_,” responded Frank, pitching into the next hill with all his
+might.
+“It is _not_ true,” cried Kate; “and if just running off to _look_ at
+the circus pass by makes you say such things, I am glad you can’t go to
+see it.”\
+"""
+ARCH_WIKI_1 = """\
+This document is an annotated index of popular articles and important information for improving and adding functionalities to the installed Arch system. Readers are assumed to have read and followed the Installation guide to obtain a basic Arch Linux installation. Having read and understood the concepts explained in #System administration and #Package management is required for following the other sections of this page and the other articles in the wiki.
+System administration
+This section deals with administrative tasks and system management. See Core utilities and Category:System administration for more.
+Users and groups
+A new installation leaves you with only the superuser account, better known as "root". Logging in as root for prolonged periods of time, possibly even exposing it via SSH on a server, is insecure. Instead, you should create and use unprivileged user account(s) for most tasks, only using the root account for system administration. See Users and groups#User management for details.
+Users and groups are a mechanism for access control; administrators may fine-tune group membership and ownership to grant or deny users and services access to system resources. Read the Users and groups article for details and potential security risks.
+Security
+Read Security for recommendations and best practices on hardening the system.
+For a list of applications to allow running commands or starting an interactive shell as another user (e.g. root), see List of applications/Security#Privilege elevation.
+Service management
+Arch Linux uses systemd as the init process, which is a system and service manager for Linux. For maintaining your Arch Linux installation, it is a good idea to learn the basics about it.
+Interaction with systemd is done through the systemctl command. See systemd#Basic systemctl usage for more information.
+A logging system is also provided, with the command journalctl. See journal for more information.
+System maintenance
+Arch is a rolling release system and has rapid package turnover, so users have to take some time to do system maintenance.
+Package management
+This section contains helpful information related to package management. See FAQ#Package management and Category:Package management for more.
+Note: It is imperative to keep up to date with changes in Arch Linux that require manual intervention before upgrading your system. Subscribe to the arch-announce mailing list or the recent news RSS feed. Alternatively, check the front page Arch news every time before you update.
+pacman
+pacman is the Arch Linux package manager: it is highly encouraged to become familiar with it before reading any other articles.
+To enable downloading packages in parallel, see pacman#Enabling parallel downloads.
+For long term handling of cached packages, see pacman#Cleaning the package cache.
+See pacman/Tips and tricks for suggestions on how to improve your interaction with pacman and package management in general.
+Repositories
+See the Official repositories article for details about the purpose of each officially maintained repository.
+If you plan on using 32-bit applications, you will want to enable the multilib repository.
+The Unofficial user repositories article lists several other unsupported repositories.
+You may consider installing the pkgstats service.
+Mirrors
+Visit the Mirrors article for steps on taking full advantage of using the fastest and most up to date mirrors of the official repositories. As explained in the article, a particularly good advice is to routinely check the Mirror Status page for a list of mirrors that have been recently synced. This can be automated with Reflector.
+Arch Build System
+Ports is a system initially used by BSD distributions consisting of build scripts that reside in a directory tree on the local system. Simply put, each port contains a script within a directory intuitively named after the installable third-party application.
+The Arch build system offers the same functionality by providing build scripts called PKGBUILDs, which are populated with information for a given piece of software: integrity hashes, project URL, version, license and build instructions. These PKGBUILDs are parsed by makepkg, the actual program that generates packages that are cleanly manageable by pacman.
+Every package in the repositories along with those present in the AUR are subject to recompilation with makepkg.
+Arch User Repository
+While the Arch Build System allows the ability of building software available in the official repositories, the Arch User Repository (AUR) is the equivalent for user submitted packages. It is an unsupported repository of build scripts accessible through the web interface or through the Aurweb RPC interface.
+Booting
+This section contains information pertaining to the boot process. An overview of the Arch boot process can be found at Arch boot process. See Category:Boot process for more.
+Hardware auto-recognition
+"""

src/audio_generators.py CHANGED Viewed

@@ -9,11 +9,14 @@ from langchain_community.callbacks import get_openai_callback
 from pydub import AudioSegment
 from src.lc_callbacks import LCMessageLoggerAsync
-from src.tts import tts_astream, sound_generation_astream
 from src.utils import auto_retry, consume_aiter
-from src.emotions.generation import EffectGeneratorAsync
 from src.emotions.utils import add_overlay_for_audio
-from src.config import ELEVENLABS_MAX_PARALLEL, logger
 from src.text_split_chain import SplitTextOutput
@@ -28,8 +31,8 @@ class AudioGeneratorSimple:
         async def tts_astream_with_semaphore(voice_id: str, text: str):
             async with semaphore:
-                iter_ = tts_astream(voice_id=voice_id, text=text)
-                bytes_ = await consume_aiter(iter_)
                 return bytes_
         tasks = []
@@ -77,12 +80,10 @@ class AudioGeneratorWithEffects:
         )
         logger.info(f"{generate_effects = }, {lines_for_sound_effect = }")
-        # Step 1: Process and modify text
-        modified_texts, sound_emotion_results = await self._process_and_modify_text(
             text_split, lines_for_sound_effect
         )
-        # Step 2: Generate TTS audio for modified text
         tts_results, self.temp_files = await self._generate_tts_audio(
             text_split, modified_texts, character_to_voice
         )
@@ -93,8 +94,12 @@ class AudioGeneratorWithEffects:
         )
         # Step 4: Merge audio files
-        normalized_audio_chunks = self._normalize_audio_chunks(audio_chunks, self.temp_files)
-        final_output = self._merge_audio_files(normalized_audio_chunks, save_path=out_path)
         # Clean up temporary files
         self._cleanup_temp_files(self.temp_files)
@@ -105,34 +110,51 @@ class AudioGeneratorWithEffects:
         """Select % of the lines randomly for sound effect generation."""
         return random.sample(range(num_lines), k=int(fraction * num_lines))
-    async def _process_and_modify_text(
         self, text_split: SplitTextOutput, lines_for_sound_effect: list[int]
     ) -> tuple[list[dict], list[dict]]:
-        """Process the text by modifying it and generating tasks for sound effects."""
-        tasks_for_text_modification = []
-        sound_emotion_tasks = []
         for idx, character_phrase in enumerate(text_split.phrases):
             character_text = character_phrase.text.strip().lower()
-            # Add text emotion modification task
-            tasks_for_text_modification.append(
-                self.effect_generator.add_emotion_to_text(character_text)
             )
             # If this line needs sound effects, generate parameters
             if idx in lines_for_sound_effect:
-                sound_emotion_tasks.append(
-                    self.effect_generator.generate_parameters_for_sound_effect(
-                        character_text
                     )
                 )
-        # Await tasks for text modification and sound effects
-        modified_texts = await asyncio.gather(*tasks_for_text_modification)
-        sound_emotion_results = await asyncio.gather(*sound_emotion_tasks)
-        return modified_texts, sound_emotion_results
     async def _generate_tts_audio(
         self,
@@ -146,8 +168,10 @@ class AudioGeneratorWithEffects:
         async def tts_astream_with_semaphore(voice_id: str, text: str, params: dict):
             async with self.semaphore:
-                iter_ = tts_astream(voice_id=voice_id, text=text, params=params)
-                bytes_ = await consume_aiter(iter_)
                 return bytes_
         for idx, (modified_text, character_phrase) in enumerate(
@@ -240,7 +264,9 @@ class AudioGeneratorWithEffects:
         return normalized_files
-    def _merge_audio_files(self, audio_filenames: list[str], save_path: Path | None = None) -> Path:
         """Helper function to merge multiple audio files into one."""
         combined = AudioSegment.from_file(audio_filenames[0])
         for filename in audio_filenames[1:]:

 from pydub import AudioSegment
 from src.lc_callbacks import LCMessageLoggerAsync
+from src.tts import tts_astream_consumed, sound_generation_astream
 from src.utils import auto_retry, consume_aiter
+from src.emotions.generation import (
+    EffectGeneratorAsync,
+    TextPreparationForTTSTaskOutput,
+)
 from src.emotions.utils import add_overlay_for_audio
+from src.config import ELEVENLABS_MAX_PARALLEL, logger, OPENAI_MAX_PARALLEL
 from src.text_split_chain import SplitTextOutput
         async def tts_astream_with_semaphore(voice_id: str, text: str):
             async with semaphore:
+                bytes_ = await tts_astream_consumed(voice_id=voice_id, text=text)
+                # bytes_ = await consume_aiter(iter_)
                 return bytes_
         tasks = []
         )
         logger.info(f"{generate_effects = }, {lines_for_sound_effect = }")
+        modified_texts, sound_emotion_results = await self._prepare_text_for_tts(
             text_split, lines_for_sound_effect
         )
         tts_results, self.temp_files = await self._generate_tts_audio(
             text_split, modified_texts, character_to_voice
         )
         )
         # Step 4: Merge audio files
+        normalized_audio_chunks = self._normalize_audio_chunks(
+            audio_chunks, self.temp_files
+        )
+        final_output = self._merge_audio_files(
+            normalized_audio_chunks, save_path=out_path
+        )
         # Clean up temporary files
         self._cleanup_temp_files(self.temp_files)
         """Select % of the lines randomly for sound effect generation."""
         return random.sample(range(num_lines), k=int(fraction * num_lines))
+    async def _prepare_text_for_tts(
         self, text_split: SplitTextOutput, lines_for_sound_effect: list[int]
     ) -> tuple[list[dict], list[dict]]:
         # Schedule the text-preparation coroutines of self.effect_generator for
         # every phrase in text_split.phrases, await them together, and return
         # two lists: (emotion outputs, sound-effect outputs).
         #
         # text_split: split result whose .phrases entries expose a .text string.
         # lines_for_sound_effect: phrase indexes that additionally get a
         #   generate_parameters_for_sound_effect task.
         #
         # NOTE(review): the annotation promises list[dict], but the collected
         # values come from TextPreparationForTTSTaskOutput.output, which is
         # declared as t.Any - confirm the actual payload type.

         # At most OPENAI_MAX_PARALLEL wrapped calls hold the semaphore (and so
         # execute func) at the same time; the rest wait on `async with`.
+        semaphore = asyncio.Semaphore(OPENAI_MAX_PARALLEL)
+        async def run_task_with_semaphore(func, **params):
+            async with semaphore:
+                outputs = await func(**params)
+                return outputs
         # Tags used below to split the mixed result list by task kind.
         # Assumes add_emotion_to_text / generate_parameters_for_sound_effect set
         # `task` to exactly these strings - TODO confirm against the generator;
         # a result with any other tag is dropped by both filters below.
+        task_emotion_code = "add_emotion"
+        task_effects_code = "add_effects"
+        tasks = []
         for idx, character_phrase in enumerate(text_split.phrases):
             # The phrase text is stripped and lower-cased before being sent on.
             character_text = character_phrase.text.strip().lower()
             # One emotion task per phrase.
+            tasks.append(
+                run_task_with_semaphore(
+                    func=self.effect_generator.add_emotion_to_text,
+                    text=character_text,
+                )
             )
             # If this line needs sound effects, generate parameters
             if idx in lines_for_sound_effect:
+                tasks.append(
+                    run_task_with_semaphore(
+                        func=self.effect_generator.generate_parameters_for_sound_effect,
+                        text=character_text,
                     )
                 )
         # The empty list only carries the type annotation; it is overwritten by
         # the gather result on the next line.
+        tasks_results: list[TextPreparationForTTSTaskOutput] = []
         # asyncio.gather returns results in the order the awaitables were
         # passed, so each filtered list below keeps phrase order.
+        tasks_results = await asyncio.gather(*tasks)
+        emotion_tasks_results = [
+            x.output for x in tasks_results if x.task == task_emotion_code
+        ]
+        effects_tasks_results = [
+            x.output for x in tasks_results if x.task == task_effects_code
+        ]
+        return emotion_tasks_results, effects_tasks_results
     async def _generate_tts_audio(
         self,
         async def tts_astream_with_semaphore(voice_id: str, text: str, params: dict):
             async with self.semaphore:
+                bytes_ = await tts_astream_consumed(
+                    voice_id=voice_id, text=text, params=params
+                )
+                # bytes_ = await consume_aiter(iter_)
                 return bytes_
         for idx, (modified_text, character_phrase) in enumerate(
         return normalized_files
+    def _merge_audio_files(
+        self, audio_filenames: list[str], save_path: Path | None = None
+    ) -> Path:
         """Helper function to merge multiple audio files into one."""
         combined = AudioSegment.from_file(audio_filenames[0])
         for filename in audio_filenames[1:]:

src/builder.py CHANGED Viewed

@@ -9,10 +9,8 @@ from src.utils import GPTModels
 class AudiobookBuilder:
-    def __init__(self) -> None:
-        self.voice_selector = VoiceSelector(
-            csv_table_fp="data/11labs_available_tts_voices.csv"
-        )
         self.audio_generator = AudioGeneratorWithEffects()
     async def split_text(self, text: str) -> SplitTextOutput:

 class AudiobookBuilder:
+    def __init__(self):
         # Create the voice selector and the audio generator held by this builder.
         # NOTE(review): VoiceSelector is constructed without arguments here;
         # presumably it takes the voices CSV path from configuration
         # (src/config.py defines VOICES_CSV_FP) - confirm in VoiceSelector.
+        self.voice_selector = VoiceSelector()
         self.audio_generator = AudioGeneratorWithEffects()
     async def split_text(self, text: str) -> SplitTextOutput:

src/config.py CHANGED Viewed

@@ -12,4 +12,8 @@ ELEVENLABS_API_KEY = os.environ["ELEVEN_LABS_API_KEY"]
 FILE_SIZE_MAX = 0.5  # in mb
 ELEVENLABS_MAX_PARALLEL = 15  # current limitation of available subscription

 FILE_SIZE_MAX = 0.5  # in mb
+OPENAI_MAX_PARALLEL = 8  # empirically set
 ELEVENLABS_MAX_PARALLEL = 15  # current limitation of available subscription
+# VOICES_CSV_FP = "data/11labs_available_tts_voices.csv"
+VOICES_CSV_FP = "data/11labs_available_tts_voices.reviewed.csv"

src/emotions/generation.py CHANGED Viewed

@@ -1,7 +1,9 @@
 import json
 from abc import ABC, abstractmethod
 import openai
 from requests import HTTPError
 from src.config import OPENAI_API_KEY, logger
@@ -16,105 +18,110 @@ from .prompts import (
 from .utils import get_audio_duration
 class AbstractEffectGenerator(ABC):
     @abstractmethod
-    def generate_text_for_sound_effect(self, text) -> dict:
         pass
     @abstractmethod
-    def generate_parameters_for_sound_effect(
-        self, text: str, generated_audio_file: str
-    ) -> dict:
         pass
     @abstractmethod
-    def add_emotion_to_text(self, text: str) -> dict:
         pass
-class EffectGenerator(AbstractEffectGenerator):
-    def __init__(self, predict_duration: bool = True, model_type: str = "gpt-4o"):
-        self.client = openai.OpenAI(api_key=OPENAI_API_KEY)
-        self.sound_effect_prompt = (
-            SOUND_EFFECT_GENERATION
-            if predict_duration
-            else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
-        )
-        self.text_modification_prompt = TEXT_MODIFICATION_WITH_SSML
-        self.model_type = model_type
-        logger.info(
-            f"EffectGenerator initialized with model_type: {model_type}, predict_duration: {predict_duration}"
-        )
-    @auto_retry
-    def generate_text_for_sound_effect(self, text: str) -> dict:
-        """Generate sound effect description and parameters based on input text."""
-        try:
-            completion = self.client.chat.completions.create(
-                model=self.model_type,
-                messages=[
-                    {"role": "system", "content": self.sound_effect_prompt},
-                    {"role": "user", "content": text},
-                ],
-                response_format={"type": "json_object"},
-            )
-            # Extracting the output
-            chatgpt_output = completion.choices[0].message.content
-            # Parse and return JSON response
-            output_dict = json.loads(chatgpt_output)
-            logger.info(
-                "Successfully generated sound effect description: %s", output_dict
-            )
-            return output_dict
-        except json.JSONDecodeError as e:
-            logger.error("Failed to parse the output text as JSON: %s", e)
-            raise RuntimeError(
-                f"Error: Failed to parse the output text as JSON.\nOutput: {chatgpt_output}"
-            )
-        except HTTPError as e:
-            logger.error("HTTP error occurred: %s", e)
-            raise RuntimeError(f"HTTP Error: {e}")
-        except Exception as e:
-            logger.error("Unexpected error occurred: %s", e)
-            raise RuntimeError(f"Unexpected Error: {e}")
-    @auto_retry
-    def generate_parameters_for_sound_effect(
-        self, text: str, generated_audio_file: str = None
-    ) -> dict:
-        llm_output = self.generate_text_for_sound_effect(text)
-        if generated_audio_file is not None:
-            llm_output["duration_seconds"] = get_audio_duration(generated_audio_file)
-            logger.info(
-                "Added duration_seconds to output based on generated audio file: %s",
-                generated_audio_file,
-            )
-        return llm_output
-    @auto_retry
-    def add_emotion_to_text(self, text: str) -> dict:
-        completion = self.client.chat.completions.create(
-            model=self.model_type,
-            messages=[
-                {"role": "system", "content": self.text_modification_prompt},
-                {"role": "user", "content": text},
-            ],
-            response_format={"type": "json_object"},
-        )
-        chatgpt_output = completion.choices[0].message.content
-        try:
-            output_dict = json.loads(chatgpt_output)
-            logger.info(
-                "Successfully modified text with emotional cues: %s", output_dict
-            )
-            return output_dict
-        except json.JSONDecodeError as e:
-            logger.error("Error in parsing the modified text: %s", e)
-            raise f"error, output_text: {chatgpt_output}"
 class EffectGeneratorAsync(AbstractEffectGenerator):
@@ -166,8 +173,8 @@ class EffectGeneratorAsync(AbstractEffectGenerator):
     @auto_retry
     async def generate_parameters_for_sound_effect(
-        self, text: str, generated_audio_file: str = None
-    ) -> dict:
         llm_output = await self.generate_text_for_sound_effect(text)
         if generated_audio_file is not None:
             llm_output["duration_seconds"] = get_audio_duration(generated_audio_file)
@@ -175,10 +182,10 @@ class EffectGeneratorAsync(AbstractEffectGenerator):
                 "Added duration_seconds to output based on generated audio file: %s",
                 generated_audio_file,
             )
-        return llm_output
     @auto_retry
-    async def add_emotion_to_text(self, text: str) -> dict:
         completion = await self.client.chat.completions.create(
             model=self.model_type,
             messages=[
@@ -193,7 +200,9 @@ class EffectGeneratorAsync(AbstractEffectGenerator):
             logger.info(
                 "Successfully modified text with emotional cues: %s", output_dict
             )
-            return output_dict
         except json.JSONDecodeError as e:
             logger.error("Error in parsing the modified text: %s", e)
             raise f"error, output_text: {chatgpt_output}"

 import json
+import typing as t
 from abc import ABC, abstractmethod
 import openai
+from pydantic import BaseModel
 from requests import HTTPError
 from src.config import OPENAI_API_KEY, logger
 from .utils import get_audio_duration
+class TextPreparationForTTSTaskOutput(BaseModel):
     # Result of one text-preparation task, tagged with the task that produced it.
     # `task` is a label; the producers visible in this file set it to
     # "add_effects" or "add_emotion".
+    task: str
     # Task payload; the producers visible in this file pass the LLM output dict.
+    output: t.Any
 class AbstractEffectGenerator(ABC):
     # Abstract interface: subclasses must implement the three coroutines below.
     @abstractmethod
+    async def generate_text_for_sound_effect(self, text) -> dict:
         # Declared to return a plain dict for `text`.
         pass
     @abstractmethod
+    async def generate_parameters_for_sound_effect(
+        self, text: str, generated_audio_file: str | None
+    ) -> TextPreparationForTTSTaskOutput:
         # Declared to return the result wrapped in TextPreparationForTTSTaskOutput;
         # `generated_audio_file` may be None.
         pass
     @abstractmethod
+    async def add_emotion_to_text(self, text: str) -> TextPreparationForTTSTaskOutput:
         # Declared to return the result wrapped in TextPreparationForTTSTaskOutput.
         pass
+# class EffectGenerator(AbstractEffectGenerator):
+#     def __init__(self, predict_duration: bool = True, model_type: str = "gpt-4o"):
+#         self.client = openai.OpenAI(api_key=OPENAI_API_KEY)
+#         self.sound_effect_prompt = (
+#             SOUND_EFFECT_GENERATION
+#             if predict_duration
+#             else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
+#         )
+#         self.text_modification_prompt = TEXT_MODIFICATION_WITH_SSML
+#         self.model_type = model_type
+#         logger.info(
+#             f"EffectGenerator initialized with model_type: {model_type}, predict_duration: {predict_duration}"
+#         )
+#     @auto_retry
+#     def generate_text_for_sound_effect(self, text: str) -> dict:
+#         """Generate sound effect description and parameters based on input text."""
+#         try:
+#             completion = self.client.chat.completions.create(
+#                 model=self.model_type,
+#                 messages=[
+#                     {"role": "system", "content": self.sound_effect_prompt},
+#                     {"role": "user", "content": text},
+#                 ],
+#                 response_format={"type": "json_object"},
+#             )
+#             # Extracting the output
+#             chatgpt_output = completion.choices[0].message.content
+#             # Parse and return JSON response
+#             output_dict = json.loads(chatgpt_output)
+#             logger.info(
+#                 "Successfully generated sound effect description: %s", output_dict
+#             )
+#             return output_dict
+#         except json.JSONDecodeError as e:
+#             logger.error("Failed to parse the output text as JSON: %s", e)
+#             raise RuntimeError(
+#                 f"Error: Failed to parse the output text as JSON.\nOutput: {chatgpt_output}"
+#             )
+#         except HTTPError as e:
+#             logger.error("HTTP error occurred: %s", e)
+#             raise RuntimeError(f"HTTP Error: {e}")
+#         except Exception as e:
+#             logger.error("Unexpected error occurred: %s", e)
+#             raise RuntimeError(f"Unexpected Error: {e}")
+#     @auto_retry
+#     def generate_parameters_for_sound_effect(
+#         self, text: str, generated_audio_file: str = None
+#     ) -> dict:
+#         llm_output = self.generate_text_for_sound_effect(text)
+#         if generated_audio_file is not None:
+#             llm_output["duration_seconds"] = get_audio_duration(generated_audio_file)
+#             logger.info(
+#                 "Added duration_seconds to output based on generated audio file: %s",
+#                 generated_audio_file,
+#             )
+#         return llm_output
+#     @auto_retry
+#     def add_emotion_to_text(self, text: str) -> dict:
+#         completion = self.client.chat.completions.create(
+#             model=self.model_type,
+#             messages=[
+#                 {"role": "system", "content": self.text_modification_prompt},
+#                 {"role": "user", "content": text},
+#             ],
+#             response_format={"type": "json_object"},
+#         )
+#         chatgpt_output = completion.choices[0].message.content
+#         try:
+#             output_dict = json.loads(chatgpt_output)
+#             logger.info(
+#                 "Successfully modified text with emotional cues: %s", output_dict
+#             )
+#             return output_dict
+#         except json.JSONDecodeError as e:
+#             logger.error("Error in parsing the modified text: %s", e)
+#             raise f"error, output_text: {chatgpt_output}"
 class EffectGeneratorAsync(AbstractEffectGenerator):
     @auto_retry
     async def generate_parameters_for_sound_effect(
+        self, text: str, generated_audio_file: str | None = None
+    ) -> TextPreparationForTTSTaskOutput:
         llm_output = await self.generate_text_for_sound_effect(text)
         if generated_audio_file is not None:
             llm_output["duration_seconds"] = get_audio_duration(generated_audio_file)
                 "Added duration_seconds to output based on generated audio file: %s",
                 generated_audio_file,
             )
+        return TextPreparationForTTSTaskOutput(task="add_effects", output=llm_output)
     @auto_retry
+    async def add_emotion_to_text(self, text: str) -> TextPreparationForTTSTaskOutput:
         completion = await self.client.chat.completions.create(
             model=self.model_type,
             messages=[
             logger.info(
                 "Successfully modified text with emotional cues: %s", output_dict
             )
+            return TextPreparationForTTSTaskOutput(
+                task="add_emotion", output=output_dict
+            )
         except json.JSONDecodeError as e:
             logger.error("Error in parsing the modified text: %s", e)
             raise f"error, output_text: {chatgpt_output}"

src/select_voice_chain.py CHANGED Viewed

@@ -13,6 +13,7 @@ from pydantic import BaseModel
 from src.config import logger
 from src.prompts import CharacterVoicePropertiesPrompt
 from src.utils import GPTModels, get_chat_llm
 class Property(StrEnum):
@@ -55,13 +56,21 @@ class VoiceSelector:
         Property.age_group: {"young", "middle_aged", "old"},
     }
-    def __init__(self, csv_table_fp: str):
         # Loads the voice table from the caller-supplied CSV path into self.df.
-        self.df = self.read_data_table(csv_table_fp=csv_table_fp)
     def read_data_table(self, csv_table_fp: str):
         # Reads the voices CSV at `csv_table_fp` and returns it as a DataFrame.
         logger.info(f'reading voice data from: "{csv_table_fp}"')
         df = pd.read_csv(csv_table_fp)
         # Replace spaces and hyphens in "age" with underscores - presumably to
         # match values such as "middle_aged"; verify against the CSV contents.
         df["age"] = df["age"].str.replace(" ", "_").str.replace("-", "_")
         return df
     def get_available_properties_str(self, prop: Property):

 from src.config import logger
 from src.prompts import CharacterVoicePropertiesPrompt
 from src.utils import GPTModels, get_chat_llm
+from src.config import VOICES_CSV_FP
 class Property(StrEnum):
         Property.age_group: {"young", "middle_aged", "old"},
     }
+    def __init__(self):
         # Loads the voice table from the configured VOICES_CSV_FP path into self.df.
+        self.df = self.read_data_table(csv_table_fp=VOICES_CSV_FP)
     def read_data_table(self, csv_table_fp: str):
         # Reads the voices CSV at `csv_table_fp`, normalizes the "age" column,
         # optionally filters out poorly reviewed voices, and returns the DataFrame.
         logger.info(f'reading voice data from: "{csv_table_fp}"')
         df = pd.read_csv(csv_table_fp)
+        logger.info(f"{df.shape=}")
         # Replace spaces and hyphens in "age" with underscores - presumably to
         # match values such as "middle_aged"; verify against the CSV contents.
         df["age"] = df["age"].str.replace(" ", "_").str.replace("-", "_")
         # The review column is optional: filtering only happens when it exists.
+        if "manual_quality_review" in df.columns:
+            logger.info('filtering df by "manual_quality_review" column')
             # Drop every row whose review value is exactly "very bad".
+            ix_to_drop = df[df["manual_quality_review"].isin(["very bad"])].index
+            df.drop(index=ix_to_drop, inplace=True)
+            logger.info(f"df.shape after filtering voices: {df.shape}")
         return df
     def get_available_properties_str(self, prop: Property):

src/tts.py CHANGED Viewed

@@ -7,6 +7,7 @@ from elevenlabs import VoiceSettings
 load_dotenv()
 from src.config import logger, ELEVENLABS_API_KEY
 ELEVEN_CLIENT = ElevenLabs(api_key=ELEVENLABS_API_KEY)
@@ -45,6 +46,14 @@ async def tts_astream(
             yield chunk
 async def sound_generation_astream(
     sound_generation_data: dict,
 ) -> t.AsyncIterator[bytes]:

 load_dotenv()
 from src.config import logger, ELEVENLABS_API_KEY
+from src.utils import auto_retry
 ELEVEN_CLIENT = ElevenLabs(api_key=ELEVENLABS_API_KEY)
             yield chunk
+@auto_retry
+async def tts_astream_consumed(
+    voice_id: str, text: str, params: dict | None = None
+) -> list[bytes]:
     # Runs tts_astream for the given voice/text/params and collects every
     # yielded chunk into a list instead of streaming it to the caller.
     # NOTE(review): presumably the stream is fully consumed here so that the
     # auto_retry decorator can re-run the whole request on failure - confirm.
+    aiterator = tts_astream(voice_id=voice_id, text=text, params=params)
+    return [x async for x in aiterator]
 async def sound_generation_astream(
     sound_generation_data: dict,
 ) -> t.AsyncIterator[bytes]:

src/utils.py CHANGED Viewed

@@ -28,7 +28,7 @@ async def consume_aiter(aiterator):
 def auto_retry(f):
     # Wraps `f` with a retry decorator and returns the wrapped callable.
     # Policy: random exponential wait with bounds min=1, max=5, giving up
     # after 6 attempts.
     decorator = retry(
-        wait=wait_random_exponential(min=1, max=5),
-        stop=stop_after_attempt(6),
     )
     return decorator(f)

 def auto_retry(f):
     # Wraps `f` with a retry decorator and returns the wrapped callable.
     # Policy: random exponential wait with bounds min=2, max=6, giving up
     # after 10 attempts.
     decorator = retry(
+        wait=wait_random_exponential(min=2, max=6),
+        stop=stop_after_attempt(10),
     )
     return decorator(f)