File size: 2,677 Bytes
3c72012
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import argparse
import os

import torchaudio
from whisperspeech.pipeline import Pipeline

def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line arguments of the text-to-audio script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default), argparse reads the arguments from ``sys.argv``,
            which is the behaviour of the original zero-argument call.

    Returns:
        argparse.Namespace: Namespace with a single ``text`` attribute
        holding the value given to the required ``--text`` option.

    Raises:
        SystemExit: Raised by argparse when ``--text`` is missing or the
            arguments are otherwise invalid.
    """
    parser = argparse.ArgumentParser(description="Convert text to audio.")
    parser.add_argument(
        "--text",
        type=str,
        required=True,
        help="The text to convert to audio.",
    )
    # Passing argv through lets callers drive the parser programmatically
    # without touching the process-wide sys.argv.
    return parser.parse_args(argv)

def convert_text_to_audio(pipe: Pipeline, text: str):
    """Synthesize speech for ``text`` using the supplied pipeline.

    Args:
        pipe (Pipeline): Text-to-speech pipeline; its ``generate`` method
            is called with ``text`` as the only argument.
        text (str): The text to turn into audio.

    Returns:
        The value returned by ``pipe.generate(text)`` (presumably the
        generated audio as a ``torch.Tensor``).
    """
    generated_audio = pipe.generate(text)
    return generated_audio


def convert_text_to_audio_file(pipe: Pipeline, text: str, output_path: str):
    """Synthesize speech for ``text`` and write it to ``output_path``.

    Args:
        pipe (Pipeline): Text-to-speech pipeline; its ``generate_to_file``
            method does the actual work.
        text (str): The text to turn into audio.
        output_path (str): Destination path of the audio file.
    """
    # The pipeline method takes the destination first and the text second,
    # the reverse of this function's own parameter order.
    write_audio = pipe.generate_to_file
    write_audio(output_path, text)


class TTSProcessor:
    """Convenience wrapper around a WhisperSpeech ``Pipeline`` instance."""

    def __init__(self, device: str):
        """Create the underlying pipeline on the requested device.

        Args:
            device (str): Device identifier forwarded to ``Pipeline``.
        """
        model_ref = "collabora/whisperspeech:s2a-q4-tiny-en+pl.model"
        self.pipe = Pipeline(s2a_ref=model_ref, device=device)

    def get_reference_voice_embedding(self, path: str):
        """Extract a speaker embedding from a reference audio file.

        Args:
            path (str): The path to the reference audio file.

        Returns:
            The result of the pipeline's ``extract_spk_emb(path)`` after
            calling ``.cpu()`` on it (presumably a ``torch.Tensor``).
        """
        embedding = self.pipe.extract_spk_emb(path)
        return embedding.cpu()

    def convert_text_to_audio(self, text: str, speaker=None):
        """Synthesize speech for ``text``.

        Args:
            text (str): The text to turn into audio.
            speaker: Optional value forwarded as the pipeline's ``speaker``
                keyword argument; defaults to ``None``.

        Returns:
            The value returned by the pipeline's ``generate`` call
            (presumably the generated audio as a ``torch.Tensor``).
        """
        generated_audio = self.pipe.generate(text, speaker=speaker)
        return generated_audio

    def convert_text_to_audio_file(self, text: str, output_path: str, speaker=None):
        """Synthesize speech for ``text`` and write it to ``output_path``.

        Args:
            text (str): The text to turn into audio.
            output_path (str): Destination path of the audio file.
            speaker: Optional value forwarded as the pipeline's ``speaker``
                keyword argument; defaults to ``None``.
        """
        # The pipeline expects the destination first, then the text.
        pipeline = self.pipe
        pipeline.generate_to_file(output_path, text, speaker=speaker)
def text_to_filename_stem(text: str, default: str = "output") -> str:
    """Derive a file-name stem from the text that is being synthesized.

    The text is lower-cased, every space is replaced by an underscore and
    a single trailing period is dropped.

    Args:
        text (str): The text given on the command line.
        default (str): Stem to use when the derived stem would be empty
            (for example for ``""`` or ``"."``), so the output never ends
            up as the hidden file ``.wav``.

    Returns:
        str: The derived stem, or ``default`` if that stem is empty.
    """
    stem = text.lower().replace(" ", "_")
    # endswith() is safe on an empty string, unlike indexing with [-1].
    if stem.endswith("."):
        stem = stem[:-1]
    return stem or default


def main() -> None:
    """Synthesize the ``--text`` argument into ``./examples/<stem>.wav``."""
    args = parse_args()
    processor = TTSProcessor("cuda")
    # The lower-cased text is used both for synthesis and for the file name.
    text = args.text.lower()
    stem = text_to_filename_stem(text)
    print(stem)
    output_dir = "./examples"
    # Create the output directory up front so that writing the file does
    # not fail on a fresh checkout where it does not exist yet.
    os.makedirs(output_dir, exist_ok=True)
    path = f"{output_dir}/{stem}.wav"
    processor.convert_text_to_audio_file(text, path)


if __name__ == "__main__":
    main()