#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import os
import argparse
import random
import string
import numpy as np
import soundfile as sf  # audio file I/O (writes the output WAV)
import torch
import inference
from txtsplit import txtsplit  # splits long text into synthesizable chunks
from typing import Optional, Tuple, List, Union
VOICES_JSON_PATH = "voices.json" # Contains your known style vectors
RANDOM_VOICES_JSON_PATH = "random_voices.json" # We'll store newly sampled vectors here
##############################################################################
# DEVICE CONFIGURATION
##############################################################################
# Detect if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
##############################################################################
# JSON LOAD/SAVE
##############################################################################
def load_json(path: str) -> dict:
"""
Load existing style vectors from the given JSON file.
Additionally, validates that all style vectors have the same length.
Args:
path (str): Path to the JSON file.
Returns:
dict: Loaded JSON data.
"""
data = {}
if os.path.exists(path):
with open(path, "r") as f:
data = json.load(f)
# Verify all vectors have the same length
lengths = set(len(vec) for vec in data.values())
if len(lengths) > 1:
raise ValueError(
f"Inconsistent vector lengths found in '{path}': {lengths}. "
"All style vectors must have the same dimensionality."
)
print(f"Loaded {len(data)} style vectors from '{path}'.")
else:
print(f"No existing '{path}' found. Starting with an empty dictionary.")
return data
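# Illustrative voices.json layout (hypothetical names; the vector length D
# depends on the StyleTTS2 checkpoint in use):
#   {
#     "speaker_a.wav": [0.013, -0.224, ...],
#     "speaker_b.wav": [0.097, 0.051, ...]
#   }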
def save_json(data: dict, path: str) -> None:
"""
Save a dict of style vectors to the given JSON file.
Args:
data (dict): Data to save.
path (str): Path to the JSON file.
"""
with open(path, "w") as f:
json.dump(data, f, indent=2)
print(f"Saved {len(data)} style vectors to '{path}'.")
##############################################################################
# GAUSSIAN FIT AND SAMPLING
##############################################################################
def fit_gaussian_to_voices(voices_data: dict) -> Tuple[np.ndarray, np.ndarray]:
"""
Fit a Gaussian distribution (mean & cov) to the style vectors in 'voices_data'.
'voices_data' is a dict: { "key.wav": <list-of-floats>, ... }
Args:
voices_data (dict): Dictionary containing style vectors.
Returns:
Tuple[np.ndarray, np.ndarray]: Mean and covariance of the fitted Gaussian.
"""
all_vecs = []
for key, data in voices_data.items():
# Convert to array
arr = np.array(data, dtype=np.float32)
# Squeeze out any dimension of size 1
arr = np.squeeze(arr)
if arr.ndim == 1:
# It's shape (D,)
all_vecs.append(arr)
else:
# If still not 1D, we skip or warn
print(
f"Skipping '{key}' because shape is {arr.shape}, not 1D after squeeze."
)
# Must have at least 2 valid vectors to compute a meaningful covariance
if len(all_vecs) < 2:
raise ValueError(
"Need at least 2 valid style vectors to fit a Gaussian distribution.\n"
"Check that each entry is 1D (or (1,D) which can be squeezed)."
)
# Stack into (N, D)
mat = np.stack(all_vecs, axis=0) # shape => (N, D)
# Sanity check
if mat.ndim != 2:
raise ValueError("Style vectors must collectively form a 2D array (N, D).")
# Compute mean & covariance
mean = np.mean(mat, axis=0) # shape (D,)
cov = np.cov(mat, rowvar=False) # shape (D, D)
print("Fitted Gaussian distribution to style vectors.")
return mean, cov
def sample_random_style(mean: np.ndarray, cov: np.ndarray) -> torch.Tensor:
"""
Sample a random style vector from a Gaussian distribution.
Args:
mean (np.ndarray): Mean vector of the Gaussian.
cov (np.ndarray): Covariance matrix of the Gaussian.
Returns:
torch.Tensor: Sampled style vector as a tensor of shape (1, D).
"""
# Sample from multivariate normal distribution
z = np.random.multivariate_normal(mean, cov)
# Convert to torch tensor
style_tensor = torch.tensor(z, dtype=torch.float32).to(device) # Move to device
# Unsqueeze to shape (1, D)
style_tensor = style_tensor.unsqueeze(0)
print(f"Sampled a new random style vector with shape {style_tensor.shape}.")
return style_tensor
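# Sketch of the fit-then-sample flow (assumes voices.json already holds at
# least two 1D style vectors):
#   voices = load_json(VOICES_JSON_PATH)
#   mean, cov = fit_gaussian_to_voices(voices)  # mean: (D,), cov: (D, D)
#   style = sample_random_style(mean, cov)      # tensor of shape (1, D) on `device`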
##############################################################################
# UTILITIES
##############################################################################
def parse_speed(value: Union[float, int, str]) -> float:
"""
    Convert 'value' into a speed factor clamped to [0.5, 2.0].
    Strings ending in '%' and bare numbers >= 10 are treated as
    percentages (e.g. "120%" or 120 -> 1.2).
Examples:
parse_speed("120%") -> 1.2
parse_speed(0.3) -> 0.5 (clamped)
parse_speed(5) -> 2.0 (clamped)
parse_speed("100%") -> 1.0
parse_speed(1) -> 1.0
parse_speed(3) -> 2.0 (clamped)
parse_speed(50) -> 0.5
parse_speed(100) -> 1.0
parse_speed(130) -> 1.3
parse_speed("150") -> 1.5
"""
# 1) If string ends with '%', parse percentage
if isinstance(value, str):
value = value.strip()
if value.endswith("%"):
numeric_str = value[:-1].strip() # remove '%' suffix
try:
f = float(numeric_str)
except ValueError:
print(
f"Invalid speed format '{value}'. Falling back to default speed 1.0."
)
f = 100.0 # fallback to "100%" -> 1.0
speed = f / 100.0
else:
# It's a normal string; parse as float
try:
f = float(value)
except ValueError:
print(
f"Invalid speed format '{value}'. Falling back to default speed 1.0."
)
f = 100.0 # fallback to "100" -> 1.0
# If f >= 10, treat as f/100. Example: 50 -> 0.5, 150 -> 1.5
speed = f / 100.0 if f >= 10 else f
else:
# 2) If not string, parse as float
        try:
            f = float(value)
        except (TypeError, ValueError):
            # float() raises TypeError for non-numeric objects, ValueError for bad strings
            print(f"Invalid speed value '{value}'. Falling back to default speed 1.0.")
            f = 1.0  # fallback to 1.0
# If f >= 10, treat as f/100
speed = f / 100.0 if f >= 10 else f
# 3) Clamp to [0.5, 2.0]
clamped_speed = max(0.5, min(2.0, speed))
if clamped_speed != speed:
print(f"Speed {speed} clamped to {clamped_speed}.")
else:
print(f"Parsed speed: {clamped_speed}")
return clamped_speed
def concatenate_audios(audios: List[np.ndarray]) -> np.ndarray:
"""
Concatenate a list of NumPy audio arrays into a single array.
Args:
audios (List[np.ndarray]): List of audio waveforms to concatenate.
Returns:
np.ndarray: Concatenated audio waveform.
"""
return np.concatenate(audios, axis=0)
##############################################################################
# SYNTHESIS CORE FUNCTION
##############################################################################
def synthesize_audio(
text_chunks: List[str],
style_vec: torch.Tensor,
speed: float,
alpha: float = 0.3,
beta: float = 0.7,
diffusion_steps: int = 7,
embedding_scale: float = 1.0,
) -> Optional[np.ndarray]:
"""
Core function to synthesize audio from text chunks and a style vector.
Args:
text_chunks (List[str]): List of text segments to synthesize.
style_vec (torch.Tensor): Style vector tensor of shape (1, D).
speed (float): Parsed speed factor.
alpha (float): Alpha parameter for inference.
beta (float): Beta parameter for inference.
diffusion_steps (int): Number of diffusion steps for inference.
embedding_scale (float): Embedding scale parameter.
Returns:
Optional[np.ndarray]: Concatenated audio waveform, or None if synthesis fails.
"""
audios = []
for idx, chunk in enumerate(text_chunks, 1):
print(f"Synthesizing chunk {idx}/{len(text_chunks)}...")
audio_segment = inference.inference(
chunk,
style_vec,
alpha=alpha,
beta=beta,
diffusion_steps=diffusion_steps,
embedding_scale=embedding_scale,
speed=speed,
)
if audio_segment is not None:
audios.append(audio_segment)
print(f"Chunk {idx} synthesized successfully.")
else:
print(f"Inference returned None for text segment {idx}: {chunk[:30]}...")
if not audios:
print("No audio segments were generated.")
return None
# Concatenate all audio segments
print("Concatenating audio segments...")
full_audio = concatenate_audios(audios)
print(f"Concatenated audio length: {len(full_audio)} samples.")
return full_audio
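# Minimal usage sketch (chunks would normally come from txtsplit, as in the
# callers below; `style` is a (1, D) tensor):
#   wav = synthesize_audio(
#       text_chunks=["Hello there.", "How are you?"],
#       style_vec=style,
#       speed=1.0,
#   )  # NumPy waveform, or None if every chunk failed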
##############################################################################
# TTS USING A RANDOMLY SAMPLED STYLE
##############################################################################
def tts_randomized(
    text: str, speed: Union[float, str] = 1.2
) -> Tuple[Optional[np.ndarray], Optional[torch.Tensor]]:
"""
1) Loads style vectors from voices.json
2) Fits a Gaussian to those vectors
3) Samples a new style vector from that distribution
4) Saves it in random_voices.json
5) Synthesizes TTS using that random style, handling long texts.
Args:
text (str): The text to be synthesized.
        speed (float | str): Speed of the generated audio; strings like "120%" are accepted (see parse_speed).
Returns:
Tuple[Optional[np.ndarray], Optional[torch.Tensor]]: (audio_waveform, style_vector)
"""
# Load known style vectors from voices.json
voices_data = load_json(VOICES_JSON_PATH)
if not voices_data:
print(f"No data found in '{VOICES_JSON_PATH}'; cannot sample a random style.")
return None, None
# Fit Gaussian
try:
mean, cov = fit_gaussian_to_voices(voices_data)
except ValueError as e:
print(f"Error fitting Gaussian: {e}")
return None, None
# Sample new vector
random_style_tensor = sample_random_style(mean, cov)
# Optionally create a random key for storing
random_key = "random_" + "".join(random.choices(string.digits, k=6))
print(f"Generated random style key: '{random_key}'")
# Save in random_voices.json
random_voices_data = load_json(RANDOM_VOICES_JSON_PATH)
random_voices_data[random_key] = random_style_tensor.squeeze(0).tolist()
save_json(random_voices_data, RANDOM_VOICES_JSON_PATH)
print(
f"Saved random style vector to '{RANDOM_VOICES_JSON_PATH}' under key '{random_key}'."
)
# Parse speed
speed = parse_speed(speed)
# Split text into manageable chunks using txtsplit
print("Splitting text into chunks...")
text_chunks = txtsplit(text)
print(f"Text split into {len(text_chunks)} chunks.")
# Synthesize audio using the core function
full_audio = synthesize_audio(
text_chunks=text_chunks, style_vec=random_style_tensor, speed=speed
)
return full_audio, random_style_tensor
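# Usage sketch (also saves the sampled vector to random_voices.json):
#   audio, style = tts_randomized("Hello there.", speed="120%")
#   # audio: NumPy waveform or None; style: the sampled (1, D) tensor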
##############################################################################
# NORMAL (NON-RANDOM) TTS LOGIC
##############################################################################
def get_or_compute_style_vector(key_or_path: str, voices_data: dict) -> torch.Tensor:
"""
If key_or_path is in voices_data, load it.
If it's a file path, compute style from audio.
Otherwise, raise an error.
Args:
key_or_path (str): Voice key or file path.
voices_data (dict): Dictionary of existing style vectors.
Returns:
torch.Tensor: Style vector tensor of shape (1, D).
"""
if key_or_path in voices_data:
print(f"Found style vector for '{key_or_path}' in '{VOICES_JSON_PATH}'.")
style_vec = torch.tensor(voices_data[key_or_path], dtype=torch.float32).to(
device
) # Move to device
elif os.path.isfile(key_or_path):
print(
f"No existing style for '{key_or_path}'. Attempting to compute from audio..."
)
style_vec = inference.compute_style(key_or_path)
if style_vec is None:
raise ValueError(f"Failed to compute style vector from '{key_or_path}'.")
style_vec = style_vec.to(device) # Move to device
voices_data[key_or_path] = style_vec.squeeze(0).tolist()
save_json(voices_data, VOICES_JSON_PATH)
print(
f"Computed and saved new style vector for '{key_or_path}' to '{VOICES_JSON_PATH}'."
)
else:
raise ValueError(
f"'{key_or_path}' not found in '{VOICES_JSON_PATH}' and is not a valid file path."
)
print(f"Original style vector shape: {style_vec.shape}")
# Ensure style_vec is 2D: (1, D)
if style_vec.dim() == 1:
style_vec = style_vec.unsqueeze(0)
style_vec = style_vec.to(device) # Ensure it's on the correct device
print(f"Unsqueezed style vector to shape: {style_vec.shape}")
elif style_vec.dim() == 3:
style_vec = style_vec.squeeze(1).to(device)
print(f"Squeezed style vector to shape: {style_vec.shape}")
elif style_vec.dim() != 2:
raise ValueError(
f"Unexpected style vector dimensions: {style_vec.shape}. Expected 2D tensor."
)
print(f"Processed style vector shape: {style_vec.shape}")
return style_vec
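# Usage sketch ('narrator.wav' is a hypothetical reference clip; the call
# either reuses a stored vector or computes and saves a new one):
#   voices = load_json(VOICES_JSON_PATH)
#   style = get_or_compute_style_vector("narrator.wav", voices)  # (1, D)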
def validate_style_vectors(voices_data: dict):
"""
Validates that all style vectors in voices_data have the same dimensionality.
Args:
voices_data (dict): Dictionary containing style vectors.
Raises:
ValueError: If inconsistent vector lengths are found.
"""
if not voices_data:
print("No style vectors to validate.")
return
lengths = set(len(vec) for vec in voices_data.values())
if len(lengths) > 1:
raise ValueError(
f"Inconsistent style vector lengths found: {lengths}. "
"All style vectors must have the same dimensionality."
)
print("All style vectors have consistent lengths.")
def tts_normal(text: str, voice: str, speed: Union[float, str] = 1.2) -> Optional[np.ndarray]:
"""
Load an existing style vector from voices.json if it exists and has 'voice'.
Otherwise, if 'voice' is a valid .wav file, compute its style vector
and store it. Finally, run normal TTS with the obtained style vector,
handling long texts.
Args:
text (str): The text to be synthesized.
voice (str): Either the key in voices.json or a .wav file path.
        speed (float | str): Speed of the generated audio; strings like "120%" are accepted (see parse_speed).
Returns:
Optional[np.ndarray]: Synthesized audio waveform, or None if something fails.
"""
# Load voices_data
try:
voices_data = load_json(VOICES_JSON_PATH)
validate_style_vectors(voices_data)
except ValueError as e:
print(f"Error loading/validating '{VOICES_JSON_PATH}': {e}")
return None
try:
style_vec = get_or_compute_style_vector(voice, voices_data)
except ValueError as e:
print(e)
return None
if style_vec is None:
print("No style vector found or computed; cannot run TTS.")
return None
# Parse speed
speed = parse_speed(speed)
# Split text into manageable chunks using txtsplit
print("Splitting text into chunks...")
text_chunks = txtsplit(text)
print(f"Text split into {len(text_chunks)} chunks.")
# Synthesize audio using the core function
full_audio = synthesize_audio(
text_chunks=text_chunks,
style_vec=style_vec,
speed=speed,
)
return full_audio
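# Usage sketch (hypothetical voice key/path; main() exposes the same flow via CLI):
#   wav = tts_normal("Hello there.", voice="narrator.wav", speed="110%")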
##############################################################################
# TTS USING A DIRECTLY PROVIDED STYLE VECTOR
##############################################################################
def tts_with_style_vector(
text: str,
style_vec: torch.Tensor,
speed: float = 1.2,
alpha: float = 0.3,
beta: float = 0.7,
diffusion_steps: int = 7,
embedding_scale: float = 1.0,
) -> Optional[np.ndarray]:
"""
Perform TTS synthesis using a *directly provided* style vector.
Args:
text (str): The text to be spoken.
style_vec (torch.Tensor): A PyTorch tensor representing the style vector.
Should be shape (1, D) if the pipeline expects a batch dimension.
speed (float): Speed factor for TTS. (Use parse_speed to handle fancy inputs.)
alpha (float): Weight for alpha in your inference function.
beta (float): Weight for beta in your inference function.
diffusion_steps (int): Number of diffusion steps for your TTS pipeline.
embedding_scale (float): Classifier-free guidance scale or similar.
Returns:
Optional[np.ndarray]: Synthesized audio waveform as a NumPy array (float32), or None if synthesis fails.
"""
# Ensure style_vec has shape (1, D)
if style_vec.dim() == 1:
style_vec = style_vec.unsqueeze(0) # e.g. (D,) -> (1, D)
style_vec = style_vec.to(device) # Move to device
print(f"Unsqueezed style vector to shape: {style_vec.shape}")
elif style_vec.dim() == 3:
style_vec = style_vec.squeeze(1).to(device)
print(f"Squeezed style vector to shape: {style_vec.shape}")
elif style_vec.dim() != 2:
print(f"Unexpected style vector shape: {style_vec.shape}. Expected 2D tensor.")
return None
print(f"Style vector shape for synthesis: {style_vec.shape}")
# Parse speed
speed_val = parse_speed(speed)
# Split text into manageable chunks using txtsplit
print("Splitting text into chunks...")
text_chunks = txtsplit(text)
print(f"Text split into {len(text_chunks)} chunks.")
# Synthesize audio using the core function
full_audio = synthesize_audio(
text_chunks=text_chunks,
style_vec=style_vec,
speed=speed_val,
alpha=alpha,
beta=beta,
diffusion_steps=diffusion_steps,
embedding_scale=embedding_scale,
)
return full_audio
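# Programmatic usage sketch (`style` is any (1, D) style tensor, e.g. from
# get_or_compute_style_vector or sample_random_style):
#   wav = tts_with_style_vector(
#       "A direct style-vector example.",
#       style_vec=style,
#       speed="120%",
#       diffusion_steps=10,
#   )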
##############################################################################
# MAIN CLI
##############################################################################
def main():
parser = argparse.ArgumentParser(
description="Script to TTS with either random style sampling or normal style usage."
)
parser.add_argument(
"--text",
type=str,
default="Hello from a random style or normal style TTS script!",
help="Text to be spoken.",
)
parser.add_argument(
"--speed",
type=str, # Changed to str to handle inputs like "120%"
default="1.2",
help="Speed of the generated audio (e.g., '120%', '1.2').",
)
parser.add_argument(
"--voice",
type=str,
default=None,
help="If not using --randomize, specify a voice key or .wav path to load/compute style.",
)
parser.add_argument(
"--randomize",
action="store_true",
help="Use random style sampling from a fitted Gaussian of known styles.",
)
parser.add_argument(
"--output", type=str, default="output.wav", help="Output WAV file name."
)
args = parser.parse_args()
if args.randomize:
# Approach: random style from distribution
print("Sampling a new random style vector from 'voices.json' distribution...")
audio, _ = tts_randomized(text=args.text, speed=args.speed)
else:
# Normal approach: use a style key or fallback
print("Using normal style approach (loading or computing from 'voices.json').")
if args.voice is None:
print("Error: --voice must be specified when not using --randomize.")
parser.print_help()
return
audio = tts_normal(text=args.text, voice=args.voice, speed=args.speed)
if audio is not None:
# Ensure audio is a NumPy array of type float32
if not isinstance(audio, np.ndarray):
print("Error: Synthesized audio is not a NumPy array.")
return
if audio.dtype != np.float32:
print(f"Converting audio from {audio.dtype} to float32.")
audio = audio.astype(np.float32)
# Save the concatenated audio
try:
            sf.write(args.output, audio, 24000)  # StyleTTS2 outputs 24 kHz audio
print(f"Audio saved to '{args.output}'.")
except Exception as e:
print(f"Failed to save audio to '{args.output}': {e}")
else:
print("No audio was generated. Check logs above for errors.")
if __name__ == "__main__":
main()
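# Example invocations (illustrative paths):
#   python text2speech.py --text "Hello there." --voice narrator.wav --output hello.wav
#   python text2speech.py --text "Hello there." --randomize --speed 120% --output random.wav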