StyleTTS2_Studio / text2speech.py
Wismut's picture
initial commit
0af9841
raw
history blame
20.6 kB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import os
import argparse
import random
import string
import numpy as np
import soundfile as sf # Alias for clarity
import torch
import inference
from txtsplit import txtsplit # Import txtsplit
from typing import Optional, Tuple, List
VOICES_JSON_PATH = "voices.json" # Contains your known style vectors
RANDOM_VOICES_JSON_PATH = "random_voices.json" # We'll store newly sampled vectors here
##############################################################################
# JSON LOAD/SAVE
##############################################################################
def load_json(path: str) -> dict:
    """
    Load existing style vectors from the given JSON file.

    Additionally, validates that the file holds a JSON object and that all
    style vectors have the same length.

    Args:
        path (str): Path to the JSON file.

    Returns:
        dict: Loaded JSON data, or an empty dict if the file does not exist.

    Raises:
        ValueError: If the file is not valid JSON (json.JSONDecodeError is a
            ValueError subclass), if the top-level value is not an object,
            if an entry has no length, or if the vectors differ in length.
    """
    if not os.path.exists(path):
        print(f"No existing '{path}' found. Starting with an empty dictionary.")
        return {}

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Callers only catch ValueError, so report structural problems as such
    # instead of letting AttributeError/TypeError escape from the checks below.
    if not isinstance(data, dict):
        raise ValueError(
            f"Expected a JSON object mapping keys to style vectors in '{path}', "
            f"got {type(data).__name__}."
        )

    # Verify all vectors have the same length
    try:
        lengths = {len(vec) for vec in data.values()}
    except TypeError as e:
        raise ValueError(
            f"Found a style vector without a length in '{path}': {e}"
        ) from e
    if len(lengths) > 1:
        raise ValueError(
            f"Inconsistent vector lengths found in '{path}': {lengths}. "
            "All style vectors must have the same dimensionality."
        )

    print(f"Loaded {len(data)} style vectors from '{path}'.")
    return data
def save_json(data: dict, path: str) -> None:
    """
    Write a dict of style vectors to a JSON file, replacing any existing file.

    The output is pretty-printed with a two-space indent.

    Args:
        data (dict): Mapping to serialize.
        path (str): Destination JSON file.
    """
    with open(path, "w") as out_file:
        json.dump(data, out_file, indent=2)

    entry_count = len(data)
    print(f"Saved {entry_count} style vectors to '{path}'.")
##############################################################################
# GAUSSIAN FIT AND SAMPLING
##############################################################################
def fit_gaussian_to_voices(voices_data: dict) -> Tuple[np.ndarray, np.ndarray]:
    """
    Fit a Gaussian distribution (mean & cov) to the style vectors in 'voices_data'.

    'voices_data' is a dict: { "key.wav": <list-of-floats>, ... }
    Each entry is converted to float32 and squeezed; entries that are not 1D
    after squeezing are skipped with a printed notice.

    Args:
        voices_data (dict): Dictionary containing style vectors.

    Returns:
        Tuple[np.ndarray, np.ndarray]: Mean of shape (D,) and covariance of
        shape (D, D) of the fitted Gaussian.

    Raises:
        ValueError: If fewer than 2 usable vectors remain, or if the usable
            vectors do not all share the same dimensionality.
    """
    all_vecs = []
    for key, data in voices_data.items():
        # Convert to array and squeeze out any dimension of size 1,
        # so that both (D,) and (1, D) entries are accepted.
        arr = np.squeeze(np.array(data, dtype=np.float32))
        if arr.ndim == 1:
            all_vecs.append(arr)
        else:
            print(
                f"Skipping '{key}' because shape is {arr.shape}, not 1D after squeeze."
            )

    # Must have at least 2 valid vectors to compute a meaningful covariance
    if len(all_vecs) < 2:
        raise ValueError(
            "Need at least 2 valid style vectors to fit a Gaussian distribution.\n"
            "Check that each entry is 1D (or (1,D) which can be squeezed)."
        )

    # Report mismatched lengths explicitly rather than through np.stack's
    # generic "all input arrays must have the same shape" error.
    dims = {vec.shape[0] for vec in all_vecs}
    if len(dims) > 1:
        raise ValueError(
            f"Style vectors have inconsistent dimensionality: {sorted(dims)}. "
            "All style vectors must have the same dimensionality."
        )

    # Stack into (N, D); always 2D here because every element is 1D of equal length.
    mat = np.stack(all_vecs, axis=0)

    # Compute mean & covariance (columns are variables, rows are observations)
    mean = np.mean(mat, axis=0)  # shape (D,)
    cov = np.cov(mat, rowvar=False)  # shape (D, D)
    print("Fitted Gaussian distribution to style vectors.")
    return mean, cov
def sample_random_style(mean: np.ndarray, cov: np.ndarray) -> torch.Tensor:
    """
    Draw one style vector from the Gaussian described by 'mean' and 'cov'.

    Args:
        mean (np.ndarray): Mean vector of the Gaussian.
        cov (np.ndarray): Covariance matrix of the Gaussian.

    Returns:
        torch.Tensor: The drawn vector as a float32 tensor with a leading
        batch axis, i.e. shape (1, D).
    """
    draw = np.random.multivariate_normal(mean, cov)

    # float32 tensor, then prepend the batch axis: (D,) -> (1, D)
    style_tensor = torch.tensor(draw, dtype=torch.float32).unsqueeze(0)

    print(f"Sampled a new random style vector with shape {style_tensor.shape}.")
    return style_tensor
##############################################################################
# UTILITIES
##############################################################################
def parse_speed(value) -> float:
    """
    Convert 'value' into a float between 0.5 and 2.0 based on custom logic.

    Rules:
      * A string ending in '%' is always read as a percentage.
      * Any other number (string or not) that is >= 10 is read as a
        percentage; smaller numbers are used as the factor directly.
      * Unparseable input (bad string, None, unsupported type, NaN) falls
        back to speed 1.0 with a printed notice instead of raising.
      * The result is clamped to [0.5, 2.0].

    Examples:
        parse_speed("120%") -> 1.2
        parse_speed(0.3) -> 0.5 (clamped)
        parse_speed(5) -> 2.0 (clamped)
        parse_speed("100%") -> 1.0
        parse_speed(1) -> 1.0
        parse_speed(3) -> 2.0 (clamped)
        parse_speed(50) -> 0.5
        parse_speed(100) -> 1.0
        parse_speed(130) -> 1.3
        parse_speed("150") -> 1.5
        parse_speed(None) -> 1.0 (fallback)

    Args:
        value: Speed given as a number or a string such as "1.2" or "120%".

    Returns:
        float: Speed factor within [0.5, 2.0].
    """
    import math  # function-scope import; math is not imported at file level

    is_percent = False
    if isinstance(value, str):
        text = value.strip()
        is_percent = text.endswith("%")
        numeric_str = text[:-1].strip() if is_percent else text
        try:
            f = float(numeric_str)
            if math.isnan(f):
                # NaN would otherwise slip through the clamp as 2.0
                raise ValueError("NaN is not a usable speed")
        except ValueError:
            print(f"Invalid speed format '{text}'. Falling back to default speed 1.0.")
            f = 100.0  # read as "100" / "100%" below -> 1.0
    else:
        try:
            f = float(value)
            if math.isnan(f):
                raise ValueError("NaN is not a usable speed")
        except (TypeError, ValueError, OverflowError):
            # float() raises TypeError for None/containers, OverflowError for huge ints
            print(f"Invalid speed value '{value}'. Falling back to default speed 1.0.")
            f = 1.0  # fallback to 1.0

    if is_percent:
        speed = f / 100.0
    else:
        # If f >= 10, treat as f/100. Example: 50 -> 0.5, 150 -> 1.5
        speed = f / 100.0 if f >= 10 else f

    # Clamp to [0.5, 2.0]
    clamped_speed = max(0.5, min(2.0, speed))
    if clamped_speed != speed:
        print(f"Speed {speed} clamped to {clamped_speed}.")
    else:
        print(f"Parsed speed: {clamped_speed}")
    return clamped_speed
def concatenate_audios(audios: List[np.ndarray]) -> np.ndarray:
    """
    Join several audio arrays end-to-end along their first axis.

    Args:
        audios (List[np.ndarray]): Waveforms in playback order.

    Returns:
        np.ndarray: A single waveform containing all inputs back to back.
    """
    joined = np.concatenate(audios, axis=0)
    return joined
##############################################################################
# SYNTHESIS CORE FUNCTION
##############################################################################
def synthesize_audio(
    text_chunks: List[str],
    style_vec: torch.Tensor,
    speed: float,
    alpha: float = 0.3,
    beta: float = 0.7,
    diffusion_steps: int = 7,
    embedding_scale: float = 1.0,
) -> Optional[np.ndarray]:
    """
    Run inference on every text chunk and join the results into one waveform.

    Chunks for which inference yields None are reported and left out; the
    remaining segments are concatenated in their original order.

    Args:
        text_chunks (List[str]): Text segments to synthesize, in order.
        style_vec (torch.Tensor): Style vector passed to inference for each chunk.
        speed (float): Speed factor forwarded to inference.
        alpha (float): Alpha parameter forwarded to inference.
        beta (float): Beta parameter forwarded to inference.
        diffusion_steps (int): Number of diffusion steps forwarded to inference.
        embedding_scale (float): Embedding scale forwarded to inference.

    Returns:
        Optional[np.ndarray]: The joined waveform, or None when no chunk
        produced audio (including an empty 'text_chunks').
    """
    segments = []
    for position, piece in enumerate(text_chunks, start=1):
        print(f"Synthesizing chunk {position}/{len(text_chunks)}...")
        result = inference.inference(
            piece,
            style_vec,
            alpha=alpha,
            beta=beta,
            diffusion_steps=diffusion_steps,
            embedding_scale=embedding_scale,
            speed=speed,
        )
        if result is None:
            # Keep going: one failed chunk should not discard the others.
            print(f"Inference returned None for text segment {position}: {piece[:30]}...")
            continue
        segments.append(result)
        print(f"Chunk {position} synthesized successfully.")

    if not segments:
        print("No audio segments were generated.")
        return None

    print("Concatenating audio segments...")
    combined = concatenate_audios(segments)
    print(f"Concatenated audio length: {len(combined)} samples.")
    return combined
##############################################################################
# TTS USING A RANDOMLY SAMPLED STYLE
##############################################################################
def tts_randomized(
    text: str, speed: float = 1.2
) -> Tuple[Optional[np.ndarray], Optional[torch.Tensor]]:
    """
    Synthesize speech with a style vector sampled from the known voices.

    1) Loads style vectors from voices.json
    2) Fits a Gaussian to those vectors
    3) Samples a new style vector from that distribution
    4) Saves it in random_voices.json under a fresh "random_NNNNNN" key
    5) Synthesizes TTS using that random style, handling long texts.

    Loading problems are reported instead of raised: a bad voices.json aborts
    with (None, None); a bad random_voices.json only skips step 4, because the
    sampled vector is still returned to the caller.

    Args:
        text (str): The text to be synthesized.
        speed (float): Speed of the generated audio; passed through
            parse_speed, so strings such as "120%" are accepted as well.

    Returns:
        Tuple[Optional[np.ndarray], Optional[torch.Tensor]]: (audio_waveform, style_vector).
        Both are None if no style could be sampled; the waveform alone is None
        if synthesis produced no audio.
    """
    # Load known style vectors from voices.json
    try:
        voices_data = load_json(VOICES_JSON_PATH)
    except ValueError as e:
        print(f"Error loading '{VOICES_JSON_PATH}': {e}")
        return None, None
    if not voices_data:
        print(f"No data found in '{VOICES_JSON_PATH}'; cannot sample a random style.")
        return None, None

    # Fit Gaussian
    try:
        mean, cov = fit_gaussian_to_voices(voices_data)
    except ValueError as e:
        print(f"Error fitting Gaussian: {e}")
        return None, None

    # Sample new vector
    random_style_tensor = sample_random_style(mean, cov)

    # Save in random_voices.json
    try:
        random_voices_data = load_json(RANDOM_VOICES_JSON_PATH)
    except ValueError as e:
        print(
            f"Error loading '{RANDOM_VOICES_JSON_PATH}': {e}. "
            "The sampled style will not be saved."
        )
        random_voices_data = None

    if random_voices_data is not None:
        # Re-draw on collision so an earlier saved style is never overwritten.
        random_key = "random_" + "".join(random.choices(string.digits, k=6))
        while random_key in random_voices_data:
            random_key = "random_" + "".join(random.choices(string.digits, k=6))
        print(f"Generated random style key: '{random_key}'")
        random_voices_data[random_key] = random_style_tensor.squeeze(0).tolist()
        save_json(random_voices_data, RANDOM_VOICES_JSON_PATH)
        print(
            f"Saved random style vector to '{RANDOM_VOICES_JSON_PATH}' under key '{random_key}'."
        )

    # Parse speed
    speed = parse_speed(speed)

    # Split text into manageable chunks using txtsplit
    print("Splitting text into chunks...")
    text_chunks = txtsplit(text)
    print(f"Text split into {len(text_chunks)} chunks.")

    # Synthesize audio using the core function
    full_audio = synthesize_audio(
        text_chunks=text_chunks, style_vec=random_style_tensor, speed=speed
    )
    return full_audio, random_style_tensor
##############################################################################
# NORMAL (NON-RANDOM) TTS LOGIC
##############################################################################
def get_or_compute_style_vector(key_or_path: str, voices_data: dict) -> torch.Tensor:
    """
    Resolve a voice key or audio file path to a 2D style vector.

    If key_or_path is in voices_data, load it.
    If it's a file path, compute the style from the audio, store it in
    voices_data (mutated in place) and write voices_data to VOICES_JSON_PATH.
    Otherwise, raise an error.

    Args:
        key_or_path (str): Voice key or file path.
        voices_data (dict): Dictionary of existing style vectors.

    Returns:
        torch.Tensor: 2D style vector tensor, (1, D) for a single stored vector.

    Raises:
        ValueError: If the key is unknown and not a file, if the style cannot
            be computed, or if the vector cannot be brought to 2 dimensions.
    """
    if key_or_path in voices_data:
        print(f"Found style vector for '{key_or_path}' in '{VOICES_JSON_PATH}'.")
        style_vec = torch.tensor(voices_data[key_or_path], dtype=torch.float32)
    elif os.path.isfile(key_or_path):
        print(
            f"No existing style for '{key_or_path}'. Attempting to compute from audio..."
        )
        # NOTE(review): compute_style presumably returns a (1, D) tensor,
        # given the squeeze(0) below -- confirm against inference.py.
        style_vec = inference.compute_style(key_or_path)
        if style_vec is None:
            raise ValueError(f"Failed to compute style vector from '{key_or_path}'.")
        voices_data[key_or_path] = style_vec.squeeze(0).tolist()
        save_json(voices_data, VOICES_JSON_PATH)
        print(
            f"Computed and saved new style vector for '{key_or_path}' to '{VOICES_JSON_PATH}'."
        )
    else:
        raise ValueError(
            f"'{key_or_path}' not found in '{VOICES_JSON_PATH}' and is not a valid file path."
        )

    print(f"Original style vector shape: {style_vec.shape}")

    # Ensure style_vec is 2D: (1, D)
    if style_vec.dim() == 1:
        style_vec = style_vec.unsqueeze(0)
        print(f"Unsqueezed style vector to shape: {style_vec.shape}")
    elif style_vec.dim() == 3:
        # squeeze(1) is a no-op unless dim 1 has size 1; verified below.
        style_vec = style_vec.squeeze(1)
        print(f"Squeezed style vector to shape: {style_vec.shape}")

    if style_vec.dim() != 2:
        raise ValueError(
            f"Unexpected style vector dimensions: {style_vec.shape}. Expected 2D tensor."
        )

    print(f"Processed style vector shape: {style_vec.shape}")
    return style_vec
def validate_style_vectors(voices_data: dict):
    """
    Check that every style vector in voices_data has the same length.

    An empty (or otherwise falsy) voices_data is accepted and only reported.

    Args:
        voices_data (dict): Dictionary containing style vectors.

    Raises:
        ValueError: If inconsistent vector lengths are found.
    """
    if voices_data:
        lengths = {len(vector) for vector in voices_data.values()}
        if len(lengths) > 1:
            raise ValueError(
                f"Inconsistent style vector lengths found: {lengths}. "
                "All style vectors must have the same dimensionality."
            )
        print("All style vectors have consistent lengths.")
    else:
        print("No style vectors to validate.")
def tts_normal(text: str, voice: str, speed: float = 1.2) -> Optional[np.ndarray]:
    """
    Synthesize speech for 'text' using a named or file-derived voice.

    The style vector is taken from voices.json when 'voice' is a key there;
    otherwise 'voice' is treated as an audio file path whose style vector is
    computed and stored. Failures along the way are printed and reported by
    returning None rather than raising.

    Args:
        text (str): The text to be synthesized.
        voice (str): Either the key in voices.json or a .wav file path.
        speed (float): Speed of the generated audio (run through parse_speed).

    Returns:
        Optional[np.ndarray]: Synthesized audio waveform, or None if something fails.
    """
    # Step 1: known voices, checked for consistent vector lengths
    try:
        known_voices = load_json(VOICES_JSON_PATH)
        validate_style_vectors(known_voices)
    except ValueError as load_error:
        print(f"Error loading/validating '{VOICES_JSON_PATH}': {load_error}")
        return None

    # Step 2: resolve the requested voice to a style vector
    try:
        voice_style = get_or_compute_style_vector(voice, known_voices)
    except ValueError as lookup_error:
        print(lookup_error)
        return None
    if voice_style is None:
        print("No style vector found or computed; cannot run TTS.")
        return None

    # Step 3: normalize speed and break the text into chunks
    speed = parse_speed(speed)
    print("Splitting text into chunks...")
    pieces = txtsplit(text)
    print(f"Text split into {len(pieces)} chunks.")

    # Step 4: synthesize with default inference parameters
    return synthesize_audio(
        text_chunks=pieces,
        style_vec=voice_style,
        speed=speed,
    )
##############################################################################
# TTS USING A DIRECTLY PROVIDED STYLE VECTOR
##############################################################################
def tts_with_style_vector(
    text: str,
    style_vec: torch.Tensor,
    speed: float = 1.2,
    alpha: float = 0.3,
    beta: float = 0.7,
    diffusion_steps: int = 7,
    embedding_scale: float = 1.0,
) -> Optional[np.ndarray]:
    """
    Perform TTS synthesis using a *directly provided* style vector.

    The vector is normalized to 2D before use: a 1D tensor gets a leading
    batch axis, and a 3D tensor has its axis 1 squeezed. Anything that is
    still not 2D afterwards is rejected.

    Args:
        text (str): The text to be spoken.
        style_vec (torch.Tensor): A PyTorch tensor representing the style vector.
            Accepted shapes: (D,), (1, D) or (B, 1, D).
        speed (float): Speed factor for TTS; run through parse_speed, so
            inputs such as "120%" are accepted as well.
        alpha (float): Alpha parameter forwarded to inference.
        beta (float): Beta parameter forwarded to inference.
        diffusion_steps (int): Number of diffusion steps forwarded to inference.
        embedding_scale (float): Embedding scale forwarded to inference.

    Returns:
        Optional[np.ndarray]: Synthesized audio waveform, or None if the style
        vector has an unsupported shape or synthesis fails.
    """
    # Ensure style_vec has shape (1, D)
    if style_vec.dim() == 1:
        style_vec = style_vec.unsqueeze(0)  # e.g. (D,) -> (1, D)
        print(f"Unsqueezed style vector to shape: {style_vec.shape}")
    elif style_vec.dim() == 3:
        # squeeze(1) is a no-op unless axis 1 has size 1; verified below.
        style_vec = style_vec.squeeze(1)
        print(f"Squeezed style vector to shape: {style_vec.shape}")

    if style_vec.dim() != 2:
        print(f"Unexpected style vector shape: {style_vec.shape}. Expected 2D tensor.")
        return None

    print(f"Style vector shape for synthesis: {style_vec.shape}")

    # Parse speed
    speed_val = parse_speed(speed)

    # Split text into manageable chunks using txtsplit
    print("Splitting text into chunks...")
    text_chunks = txtsplit(text)
    print(f"Text split into {len(text_chunks)} chunks.")

    # Synthesize audio using the core function
    full_audio = synthesize_audio(
        text_chunks=text_chunks,
        style_vec=style_vec,
        speed=speed_val,
        alpha=alpha,
        beta=beta,
        diffusion_steps=diffusion_steps,
        embedding_scale=embedding_scale,
    )
    return full_audio
##############################################################################
# MAIN CLI
##############################################################################
def main():
    """
    Command-line entry point.

    Parses --text, --speed, --voice, --randomize and --output, synthesizes
    speech either with a randomly sampled style (--randomize) or with the
    given --voice, and writes the result to the output file as float32 audio
    at 24000 Hz. Problems are printed; nothing is raised to the caller.
    """
    parser = argparse.ArgumentParser(
        description="Script to TTS with either random style sampling or normal style usage."
    )
    parser.add_argument(
        "--text",
        type=str,
        default="Hello from a random style or normal style TTS script!",
        help="Text to be spoken.",
    )
    # --speed stays a string so that inputs like "120%" reach parse_speed intact.
    parser.add_argument(
        "--speed",
        type=str,
        default="1.2",
        help="Speed of the generated audio (e.g., '120%', '1.2').",
    )
    parser.add_argument(
        "--voice",
        type=str,
        default=None,
        help="If not using --randomize, specify a voice key or .wav path to load/compute style.",
    )
    parser.add_argument(
        "--randomize",
        action="store_true",
        help="Use random style sampling from a fitted Gaussian of known styles.",
    )
    parser.add_argument(
        "--output", type=str, default="output.wav", help="Output WAV file name."
    )
    options = parser.parse_args()

    if options.randomize:
        # Random style drawn from the distribution of known voices
        print("Sampling a new random style vector from 'voices.json' distribution...")
        waveform, _ = tts_randomized(text=options.text, speed=options.speed)
    else:
        # Named voice or audio file
        print("Using normal style approach (loading or computing from 'voices.json').")
        if options.voice is None:
            print("Error: --voice must be specified when not using --randomize.")
            parser.print_help()
            return
        waveform = tts_normal(
            text=options.text, voice=options.voice, speed=options.speed
        )

    if waveform is None:
        print("No audio was generated. Check logs above for errors.")
        return

    if not isinstance(waveform, np.ndarray):
        print("Error: Synthesized audio is not a NumPy array.")
        return

    if waveform.dtype != np.float32:
        print(f"Converting audio from {waveform.dtype} to float32.")
        waveform = waveform.astype(np.float32)

    # Write the joined audio; a failed write is reported, not raised.
    try:
        sf.write(options.output, waveform, 24000)
        print(f"Audio saved to '{options.output}'.")
    except Exception as write_error:
        print(f"Failed to save audio to '{options.output}': {write_error}")


if __name__ == "__main__":
    main()