File size: 1,374 Bytes
8035662
 
 
 
 
 
3e2e994
 
 
 
 
 
 
8035662
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9043fad
8035662
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from kokoro import KPipeline
import soundfile as sf
import numpy as np
import logging


# Build the shared Kokoro pipeline once at import time; generate_audio() reuses it.
pipeline = KPipeline(lang_code="a")
try:
    # NOTE(review): this relies on KPipeline exposing a .to() method — confirm
    # against the installed kokoro version. Any failure here (including a
    # missing attribute) leaves the pipeline built above in place.
    pipeline = pipeline.to("cuda")
except Exception:
    # Catch Exception instead of using a bare except so that
    # KeyboardInterrupt / SystemExit raised during startup still propagate.
    logging.warning("CUDA not available, using CPU")


def generate_audio(
    text: str,
    voice: str = "af_heart",
    speed: float = 1,
    save_segments: bool = False,
    progress=None,
) -> np.ndarray:
    """
    Generate audio from text using the module-level Kokoro TTS pipeline

    The pipeline is invoked with a split pattern matching a literal period,
    every yielded segment is collected, and the segment audio is joined into
    a single array.

    Args:
        text (str): Text to convert to speech
        voice (str): Voice ID to use
        speed (float): Speech speed multiplier
        save_segments (bool): Whether to save individual audio segments as
            ``segment_<index>.wav`` in the current working directory
        progress: Optional progress reporter exposing
            ``tqdm(iterable, desc=...)`` (presumably a Gradio ``Progress``
            object; confirm against callers). When None, segments are
            processed without progress reporting.

    Returns:
        numpy.ndarray: Combined audio data at 24kHz sample rate

    Raises:
        ValueError: If the pipeline produced no audio segments for ``text``
    """
    sample_rate = 24000  # rate used when writing the per-segment WAV files

    generator = pipeline(text, voice=voice, speed=speed, split_pattern=r"\.")

    # Materialise the generator so a progress bar can know the total count.
    segments = list(generator)
    if not segments:
        # np.concatenate([]) would raise a ValueError with an opaque message;
        # keep the exception type but say what actually went wrong.
        raise ValueError("No audio segments were generated for the given text")

    # `progress` defaults to None, so only wrap the segments when a reporter
    # was actually supplied.
    if progress is None:
        iterable = segments
    else:
        iterable = progress.tqdm(segments, desc="Generating audio")

    all_audio = []
    for i, (gs, ps, audio) in enumerate(iterable):
        logging.info("Processing segment")
        logging.info("Graphemes: %s", gs)
        logging.info("Phonemes: %s", ps)
        all_audio.append(audio)

        if save_segments:
            sf.write(f"segment_{i}.wav", audio, sample_rate)

    # Concatenate all audio segments
    return np.concatenate(all_audio)