#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import json
import os
import random
import string
from typing import List, Optional, Tuple

import numpy as np
import soundfile as sf  # For writing the output WAV file
import torch

import inference
from txtsplit import txtsplit  # Splits long text into manageable chunks

VOICES_JSON_PATH = "voices.json"  # Contains your known style vectors
RANDOM_VOICES_JSON_PATH = "random_voices.json"  # Newly sampled vectors are stored here
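
# Expected layout of voices.json (an illustrative sketch inferred from how
# load_json() and fit_gaussian_to_voices() consume it; the float values below
# are made up):
#
#   {
#     "alice.wav": [0.0123, -0.0456, 0.0789, ...],
#     "bob.wav":   [0.0987,  0.0012, -0.0345, ...]
#   }
#
# Each entry maps a voice key to a flat list of floats, and all lists should
# share the same length D. fit_gaussian_to_voices() additionally squeezes
# entries that were stored with a leading batch dimension, i.e. shape (1, D).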

##############################################################################
# DEVICE CONFIGURATION
##############################################################################

# Detect whether CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

##############################################################################
# JSON LOAD/SAVE
##############################################################################

def load_json(path: str) -> dict:
    """
    Load existing style vectors from the given JSON file.
    Additionally, validates that all style vectors have the same length.

    Args:
        path (str): Path to the JSON file.

    Returns:
        dict: Loaded JSON data.
    """
    data = {}
    if os.path.exists(path):
        with open(path, "r") as f:
            data = json.load(f)
        # Verify all vectors have the same length
        lengths = set(len(vec) for vec in data.values())
        if len(lengths) > 1:
            raise ValueError(
                f"Inconsistent vector lengths found in '{path}': {lengths}. "
                "All style vectors must have the same dimensionality."
            )
        print(f"Loaded {len(data)} style vectors from '{path}'.")
    else:
        print(f"No existing '{path}' found. Starting with an empty dictionary.")
    return data

def save_json(data: dict, path: str) -> None:
    """
    Save a dict of style vectors to the given JSON file.

    Args:
        data (dict): Data to save.
        path (str): Path to the JSON file.
    """
    with open(path, "w") as f:
        json.dump(data, f, indent=2)
    print(f"Saved {len(data)} style vectors to '{path}'.")

##############################################################################
# GAUSSIAN FIT AND SAMPLING
##############################################################################

def fit_gaussian_to_voices(voices_data: dict) -> Tuple[np.ndarray, np.ndarray]:
    """
    Fit a Gaussian distribution (mean & cov) to the style vectors in 'voices_data'.
    'voices_data' is a dict: { "key.wav": <list-of-floats>, ... }

    Args:
        voices_data (dict): Dictionary containing style vectors.

    Returns:
        Tuple[np.ndarray, np.ndarray]: Mean and covariance of the fitted Gaussian.
    """
    all_vecs = []
    for key, data in voices_data.items():
        # Convert to an array and squeeze out any dimension of size 1
        arr = np.array(data, dtype=np.float32)
        arr = np.squeeze(arr)
        if arr.ndim == 1:
            # Shape (D,)
            all_vecs.append(arr)
        else:
            # Still not 1D after squeezing; skip with a warning
            print(f"Skipping '{key}' because shape is {arr.shape}, not 1D after squeeze.")

    # Need at least 2 valid vectors to compute a meaningful covariance
    if len(all_vecs) < 2:
        raise ValueError(
            "Need at least 2 valid style vectors to fit a Gaussian distribution.\n"
            "Check that each entry is 1D (or (1, D), which can be squeezed)."
        )

    # Stack into (N, D)
    mat = np.stack(all_vecs, axis=0)
    if mat.ndim != 2:
        raise ValueError("Style vectors must collectively form a 2D array (N, D).")

    # Compute mean & covariance
    mean = np.mean(mat, axis=0)       # shape (D,)
    cov = np.cov(mat, rowvar=False)   # shape (D, D)
    print("Fitted Gaussian distribution to style vectors.")
    return mean, cov

def sample_random_style(mean: np.ndarray, cov: np.ndarray) -> torch.Tensor:
    """
    Sample a random style vector from a Gaussian distribution.

    Args:
        mean (np.ndarray): Mean vector of the Gaussian.
        cov (np.ndarray): Covariance matrix of the Gaussian.

    Returns:
        torch.Tensor: Sampled style vector as a tensor of shape (1, D).
    """
    # Sample from the multivariate normal distribution
    z = np.random.multivariate_normal(mean, cov)
    # Convert to a torch tensor on the target device, then unsqueeze to (1, D)
    style_tensor = torch.tensor(z, dtype=torch.float32).to(device)
    style_tensor = style_tensor.unsqueeze(0)
    print(f"Sampled a new random style vector with shape {style_tensor.shape}.")
    return style_tensor
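
# Illustrative usage sketch (not executed here; assumes voices.json holds at
# least two 1-D style vectors):
#
#   voices = load_json(VOICES_JSON_PATH)
#   mu, sigma = fit_gaussian_to_voices(voices)
#   new_style = sample_random_style(mu, sigma)   # torch tensor of shape (1, D)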

##############################################################################
# UTILITIES
##############################################################################

def parse_speed(value) -> float:
    """
    Convert 'value' into a float between 0.5 and 2.0 based on custom logic.

    Examples:
        parse_speed("120%") -> 1.2
        parse_speed(0.3)    -> 0.5 (clamped)
        parse_speed(5)      -> 2.0 (clamped)
        parse_speed("100%") -> 1.0
        parse_speed(1)      -> 1.0
        parse_speed(3)      -> 2.0 (clamped)
        parse_speed(50)     -> 0.5
        parse_speed(100)    -> 1.0
        parse_speed(130)    -> 1.3
        parse_speed("150")  -> 1.5
    """
    # 1) If the value is a string ending with '%', parse it as a percentage
    if isinstance(value, str):
        value = value.strip()
        if value.endswith("%"):
            numeric_str = value[:-1].strip()  # remove '%' suffix
            try:
                f = float(numeric_str)
            except ValueError:
                print(f"Invalid speed format '{value}'. Falling back to default speed 1.0.")
                f = 100.0  # fallback to "100%" -> 1.0
            speed = f / 100.0
        else:
            # A plain numeric string; parse as float
            try:
                f = float(value)
            except ValueError:
                print(f"Invalid speed format '{value}'. Falling back to default speed 1.0.")
                f = 100.0  # fallback to "100" -> 1.0
            # If f >= 10, treat it as a percentage. Example: 50 -> 0.5, 150 -> 1.5
            speed = f / 100.0 if f >= 10 else f
    else:
        # 2) Not a string; parse as a float
        try:
            f = float(value)
        except (TypeError, ValueError):
            print(f"Invalid speed value '{value}'. Falling back to default speed 1.0.")
            f = 1.0  # fallback to 1.0
        # If f >= 10, treat it as a percentage
        speed = f / 100.0 if f >= 10 else f

    # 3) Clamp to [0.5, 2.0]
    clamped_speed = max(0.5, min(2.0, speed))
    if clamped_speed != speed:
        print(f"Speed {speed} clamped to {clamped_speed}.")
    else:
        print(f"Parsed speed: {clamped_speed}")
    return clamped_speed

def concatenate_audios(audios: List[np.ndarray]) -> np.ndarray:
    """
    Concatenate a list of NumPy audio arrays into a single array.

    Args:
        audios (List[np.ndarray]): List of audio waveforms to concatenate.

    Returns:
        np.ndarray: Concatenated audio waveform.
    """
    return np.concatenate(audios, axis=0)

##############################################################################
# SYNTHESIS CORE FUNCTION
##############################################################################

def synthesize_audio(
    text_chunks: List[str],
    style_vec: torch.Tensor,
    speed: float,
    alpha: float = 0.3,
    beta: float = 0.7,
    diffusion_steps: int = 7,
    embedding_scale: float = 1.0,
) -> Optional[np.ndarray]:
    """
    Core function to synthesize audio from text chunks and a style vector.

    Args:
        text_chunks (List[str]): List of text segments to synthesize.
        style_vec (torch.Tensor): Style vector tensor of shape (1, D).
        speed (float): Parsed speed factor.
        alpha (float): Alpha parameter for inference.
        beta (float): Beta parameter for inference.
        diffusion_steps (int): Number of diffusion steps for inference.
        embedding_scale (float): Embedding scale parameter.

    Returns:
        Optional[np.ndarray]: Concatenated audio waveform, or None if synthesis fails.
    """
    audios = []
    for idx, chunk in enumerate(text_chunks, 1):
        print(f"Synthesizing chunk {idx}/{len(text_chunks)}...")
        audio_segment = inference.inference(
            chunk,
            style_vec,
            alpha=alpha,
            beta=beta,
            diffusion_steps=diffusion_steps,
            embedding_scale=embedding_scale,
            speed=speed,
        )
        if audio_segment is not None:
            audios.append(audio_segment)
            print(f"Chunk {idx} synthesized successfully.")
        else:
            print(f"Inference returned None for text segment {idx}: {chunk[:30]}...")

    if not audios:
        print("No audio segments were generated.")
        return None

    # Concatenate all audio segments
    print("Concatenating audio segments...")
    full_audio = concatenate_audios(audios)
    print(f"Concatenated audio length: {len(full_audio)} samples.")
    return full_audio
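
# Illustrative usage sketch (not executed here; assumes the inference module is
# set up and a (1, D) style tensor is available):
#
#   chunks = txtsplit("A long passage of text to be read aloud ...")
#   audio = synthesize_audio(chunks, style_vec, speed=1.0, diffusion_steps=10)
#   if audio is not None:
#       sf.write("preview.wav", audio.astype(np.float32), 24000)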

##############################################################################
# TTS USING A RANDOMLY SAMPLED STYLE
##############################################################################

def tts_randomized(
    text: str, speed: float = 1.2
) -> Tuple[Optional[np.ndarray], Optional[torch.Tensor]]:
    """
    1) Loads style vectors from voices.json
    2) Fits a Gaussian to those vectors
    3) Samples a new style vector from that distribution
    4) Saves it in random_voices.json
    5) Synthesizes TTS using that random style, handling long texts.

    Args:
        text (str): The text to be synthesized.
        speed (float): Speed of the generated audio.

    Returns:
        Tuple[Optional[np.ndarray], Optional[torch.Tensor]]: (audio_waveform, style_vector)
    """
    # Load known style vectors from voices.json
    voices_data = load_json(VOICES_JSON_PATH)
    if not voices_data:
        print(f"No data found in '{VOICES_JSON_PATH}'; cannot sample a random style.")
        return None, None

    # Fit Gaussian
    try:
        mean, cov = fit_gaussian_to_voices(voices_data)
    except ValueError as e:
        print(f"Error fitting Gaussian: {e}")
        return None, None

    # Sample a new vector
    random_style_tensor = sample_random_style(mean, cov)

    # Create a random key for storing it
    random_key = "random_" + "".join(random.choices(string.digits, k=6))
    print(f"Generated random style key: '{random_key}'")

    # Save in random_voices.json
    random_voices_data = load_json(RANDOM_VOICES_JSON_PATH)
    random_voices_data[random_key] = random_style_tensor.squeeze(0).tolist()
    save_json(random_voices_data, RANDOM_VOICES_JSON_PATH)
    print(f"Saved random style vector to '{RANDOM_VOICES_JSON_PATH}' under key '{random_key}'.")

    # Parse speed
    speed = parse_speed(speed)

    # Split text into manageable chunks using txtsplit
    print("Splitting text into chunks...")
    text_chunks = txtsplit(text)
    print(f"Text split into {len(text_chunks)} chunks.")

    # Synthesize audio using the core function
    full_audio = synthesize_audio(
        text_chunks=text_chunks, style_vec=random_style_tensor, speed=speed
    )
    return full_audio, random_style_tensor

##############################################################################
# NORMAL (NON-RANDOM) TTS LOGIC
##############################################################################

def get_or_compute_style_vector(key_or_path: str, voices_data: dict) -> torch.Tensor:
    """
    If key_or_path is in voices_data, load it.
    If it's a file path, compute the style from the audio file.
    Otherwise, raise an error.

    Args:
        key_or_path (str): Voice key or file path.
        voices_data (dict): Dictionary of existing style vectors.

    Returns:
        torch.Tensor: Style vector tensor of shape (1, D).
    """
    if key_or_path in voices_data:
        print(f"Found style vector for '{key_or_path}' in '{VOICES_JSON_PATH}'.")
        style_vec = torch.tensor(voices_data[key_or_path], dtype=torch.float32).to(device)
    elif os.path.isfile(key_or_path):
        print(f"No existing style for '{key_or_path}'. Attempting to compute from audio...")
        style_vec = inference.compute_style(key_or_path)
        if style_vec is None:
            raise ValueError(f"Failed to compute style vector from '{key_or_path}'.")
        style_vec = style_vec.to(device)
        voices_data[key_or_path] = style_vec.squeeze(0).tolist()
        save_json(voices_data, VOICES_JSON_PATH)
        print(f"Computed and saved new style vector for '{key_or_path}' to '{VOICES_JSON_PATH}'.")
    else:
        raise ValueError(
            f"'{key_or_path}' not found in '{VOICES_JSON_PATH}' and is not a valid file path."
        )

    print(f"Original style vector shape: {style_vec.shape}")

    # Ensure style_vec is 2D: (1, D)
    if style_vec.dim() == 1:
        style_vec = style_vec.unsqueeze(0).to(device)
        print(f"Unsqueezed style vector to shape: {style_vec.shape}")
    elif style_vec.dim() == 3:
        style_vec = style_vec.squeeze(1).to(device)
        print(f"Squeezed style vector to shape: {style_vec.shape}")
    elif style_vec.dim() != 2:
        raise ValueError(
            f"Unexpected style vector dimensions: {style_vec.shape}. Expected 2D tensor."
        )

    print(f"Processed style vector shape: {style_vec.shape}")
    return style_vec
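
# Illustrative usage sketch (not executed here; 'reference.wav' is a
# hypothetical audio file used to compute and cache a new style vector):
#
#   voices = load_json(VOICES_JSON_PATH)
#   style = get_or_compute_style_vector("reference.wav", voices)  # (1, D) tensor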

def validate_style_vectors(voices_data: dict):
    """
    Validates that all style vectors in voices_data have the same dimensionality.

    Args:
        voices_data (dict): Dictionary containing style vectors.

    Raises:
        ValueError: If inconsistent vector lengths are found.
    """
    if not voices_data:
        print("No style vectors to validate.")
        return
    lengths = set(len(vec) for vec in voices_data.values())
    if len(lengths) > 1:
        raise ValueError(
            f"Inconsistent style vector lengths found: {lengths}. "
            "All style vectors must have the same dimensionality."
        )
    print("All style vectors have consistent lengths.")

def tts_normal(text: str, voice: str, speed: float = 1.2) -> Optional[np.ndarray]:
    """
    Load an existing style vector from voices.json if it exists and contains 'voice'.
    Otherwise, if 'voice' is a valid .wav file, compute its style vector and
    store it. Finally, run normal TTS with the obtained style vector,
    handling long texts.

    Args:
        text (str): The text to be synthesized.
        voice (str): Either the key in voices.json or a .wav file path.
        speed (float): Speed of the generated audio.

    Returns:
        Optional[np.ndarray]: Synthesized audio waveform, or None if something fails.
    """
    # Load and validate the stored style vectors
    try:
        voices_data = load_json(VOICES_JSON_PATH)
        validate_style_vectors(voices_data)
    except ValueError as e:
        print(f"Error loading/validating '{VOICES_JSON_PATH}': {e}")
        return None

    try:
        style_vec = get_or_compute_style_vector(voice, voices_data)
    except ValueError as e:
        print(e)
        return None

    if style_vec is None:
        print("No style vector found or computed; cannot run TTS.")
        return None

    # Parse speed
    speed = parse_speed(speed)

    # Split text into manageable chunks using txtsplit
    print("Splitting text into chunks...")
    text_chunks = txtsplit(text)
    print(f"Text split into {len(text_chunks)} chunks.")

    # Synthesize audio using the core function
    full_audio = synthesize_audio(
        text_chunks=text_chunks,
        style_vec=style_vec,
        speed=speed,
    )
    return full_audio

##############################################################################
# TTS USING A DIRECTLY PROVIDED STYLE VECTOR
##############################################################################

def tts_with_style_vector(
    text: str,
    style_vec: torch.Tensor,
    speed: float = 1.2,
    alpha: float = 0.3,
    beta: float = 0.7,
    diffusion_steps: int = 7,
    embedding_scale: float = 1.0,
) -> Optional[np.ndarray]:
    """
    Perform TTS synthesis using a *directly provided* style vector.

    Args:
        text (str): The text to be spoken.
        style_vec (torch.Tensor): A PyTorch tensor representing the style vector.
            Should be shape (1, D) if the pipeline expects a batch dimension.
        speed (float): Speed factor for TTS. (Use parse_speed to handle fancy inputs.)
        alpha (float): Weight for alpha in your inference function.
        beta (float): Weight for beta in your inference function.
        diffusion_steps (int): Number of diffusion steps for your TTS pipeline.
        embedding_scale (float): Classifier-free guidance scale or similar.

    Returns:
        Optional[np.ndarray]: Synthesized audio waveform as a NumPy array (float32),
        or None if synthesis fails.
    """
    # Ensure style_vec has shape (1, D)
    if style_vec.dim() == 1:
        style_vec = style_vec.unsqueeze(0)  # (D,) -> (1, D)
        print(f"Unsqueezed style vector to shape: {style_vec.shape}")
    elif style_vec.dim() == 3:
        style_vec = style_vec.squeeze(1)
        print(f"Squeezed style vector to shape: {style_vec.shape}")
    elif style_vec.dim() != 2:
        print(f"Unexpected style vector shape: {style_vec.shape}. Expected 2D tensor.")
        return None

    # Make sure the tensor lives on the target device regardless of input shape
    style_vec = style_vec.to(device)
    print(f"Style vector shape for synthesis: {style_vec.shape}")

    # Parse speed
    speed_val = parse_speed(speed)

    # Split text into manageable chunks using txtsplit
    print("Splitting text into chunks...")
    text_chunks = txtsplit(text)
    print(f"Text split into {len(text_chunks)} chunks.")

    # Synthesize audio using the core function
    full_audio = synthesize_audio(
        text_chunks=text_chunks,
        style_vec=style_vec,
        speed=speed_val,
        alpha=alpha,
        beta=beta,
        diffusion_steps=diffusion_steps,
        embedding_scale=embedding_scale,
    )
    return full_audio
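
# A minimal illustrative sketch (not part of the pipeline above and never
# called): blend two stored styles and synthesize with the blended vector.
# The keys 'alice.wav' and 'bob.wav' are hypothetical; substitute keys that
# actually exist in voices.json.
def demo_blend_styles(text: str = "Hello there!") -> Optional[np.ndarray]:
    voices = load_json(VOICES_JSON_PATH)
    a = torch.tensor(voices["alice.wav"], dtype=torch.float32)
    b = torch.tensor(voices["bob.wav"], dtype=torch.float32)
    blended = 0.5 * a + 0.5 * b  # simple linear interpolation between styles
    return tts_with_style_vector(text, blended, speed="110%")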

##############################################################################
# MAIN CLI
##############################################################################

def main():
    parser = argparse.ArgumentParser(
        description="Script to TTS with either random style sampling or normal style usage."
    )
    parser.add_argument(
        "--text",
        type=str,
        default="Hello from a random style or normal style TTS script!",
        help="Text to be spoken.",
    )
    parser.add_argument(
        "--speed",
        type=str,  # str so inputs like "120%" are accepted
        default="1.2",
        help="Speed of the generated audio (e.g., '120%%', '1.2').",
    )
    parser.add_argument(
        "--voice",
        type=str,
        default=None,
        help="If not using --randomize, specify a voice key or .wav path to load/compute style.",
    )
    parser.add_argument(
        "--randomize",
        action="store_true",
        help="Use random style sampling from a fitted Gaussian of known styles.",
    )
    parser.add_argument(
        "--output", type=str, default="output.wav", help="Output WAV file name."
    )
    args = parser.parse_args()

    if args.randomize:
        # Random approach: sample a new style from the fitted distribution
        print("Sampling a new random style vector from 'voices.json' distribution...")
        audio, _ = tts_randomized(text=args.text, speed=args.speed)
    else:
        # Normal approach: use a style key or a .wav file path
        print("Using normal style approach (loading or computing from 'voices.json').")
        if args.voice is None:
            print("Error: --voice must be specified when not using --randomize.")
            parser.print_help()
            return
        audio = tts_normal(text=args.text, voice=args.voice, speed=args.speed)

    if audio is not None:
        # Ensure audio is a NumPy array of type float32
        if not isinstance(audio, np.ndarray):
            print("Error: Synthesized audio is not a NumPy array.")
            return
        if audio.dtype != np.float32:
            print(f"Converting audio from {audio.dtype} to float32.")
            audio = audio.astype(np.float32)
        # Save the concatenated audio
        try:
            sf.write(args.output, audio, 24000)
            print(f"Audio saved to '{args.output}'.")
        except Exception as e:
            print(f"Failed to save audio to '{args.output}': {e}")
    else:
        print("No audio was generated. Check logs above for errors.")


if __name__ == "__main__":
    main()
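
# Example invocations (illustrative; the script filename is hypothetical, and
# the model checkpoints plus voices.json are assumed to be in place):
#
#   python tts_cli.py --voice alice.wav --text "Hello!" --speed 120% --output hello.wav
#   python tts_cli.py --randomize --text "A brand new voice." --output random.wav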