from parler_tts import ParlerTTSForConditionalGeneration  # pip install git+https://github.com/huggingface/parler-tts.git
from transformers import AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Download the base model and tokenizer
model_name = "parler-tts/parler-tts-mini-v1"
model = ParlerTTSForConditionalGeneration.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load dataset (replace with your dataset)
dataset = load_dataset("lj_speech")  # Example dataset; adjust as needed

# Preprocess function (customize based on your dataset).
# NOTE: this only tokenizes the text. For real fine-tuning you also need to
# encode the target audio into "labels" (e.g. with the model's audio codec);
# without labels the Trainer cannot compute a loss.
def preprocess_function(examples):
    inputs = tokenizer(
        examples["text"],
        padding=True,
        truncation=True,
    )
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
    }

train_dataset = dataset["train"].map(preprocess_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./tts_finetuned",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_steps=500,
    logging_steps=10,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Fine-tune
trainer.train()

# Save fine-tuned model and tokenizer
trainer.save_model("./tts_finetuned")
tokenizer.save_pretrained("./tts_finetuned")

print("TTS model fine-tuned and saved to './tts_finetuned'. Upload to models/tts_model in your Space.")
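
# --- Inference sketch (illustrative addition, not part of the original script) ---
# A minimal example of loading the fine-tuned checkpoint and generating audio,
# following the standard Parler-TTS generation API (description conditions the
# voice, prompt is the text to speak). The checkpoint path "./tts_finetuned",
# the prompt, and the description strings below are assumptions for illustration.
import torch
import soundfile as sf
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

device = "cuda:0" if torch.cuda.is_available() else "cpu"

ft_model = ParlerTTSForConditionalGeneration.from_pretrained("./tts_finetuned").to(device)
ft_tokenizer = AutoTokenizer.from_pretrained("./tts_finetuned")

prompt = "Hello, this is a test of the fine-tuned voice."
description = "A clear, neutral speaker with moderate speed and pitch."

# Parler-TTS takes the description as input_ids and the spoken text as prompt_input_ids
input_ids = ft_tokenizer(description, return_tensors="pt").input_ids.to(device)
prompt_input_ids = ft_tokenizer(prompt, return_tensors="pt").input_ids.to(device)

generation = ft_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
audio = generation.cpu().numpy().squeeze()
sf.write("sample.wav", audio, ft_model.config.sampling_rate)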