import logging
import wave
from io import BytesIO

from flask import Flask, request, send_file, jsonify
from flask_cors import CORS
from huggingface_hub import hf_hub_download
from piper import PiperVoice

app = Flask(__name__)
CORS(app)

# Set up logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Available models list
available_models = [
    {"repo_id": "csukuangfj/vits-piper-en_US-lessac-medium",
     "filename": "en_US-lessac-medium.onnx"},
    {"repo_id": "csukuangfj/vits-piper-en_US-hfc_female-medium",
     "filename": "en_US-hfc_female-medium.onnx"},
    {"repo_id": "csukuangfj/vits-piper-en_GB-southern_english_female-medium",
     "filename": "en_GB-southern_english_female-medium.onnx"},
]


def synthesize_speech(repo_id, model_filename, text, sentence_silence, length_scale):
    """Download the requested Piper voice from the Hugging Face Hub and return WAV audio in a BytesIO buffer."""
    logger.debug("Downloading model and config files...")
    model_path = hf_hub_download(repo_id=repo_id, filename=model_filename)
    config_path = hf_hub_download(repo_id=repo_id, filename=f"{model_filename}.json")

    logger.debug("Loading PiperVoice model...")
    voice = PiperVoice.load(model_path, config_path)

    buffer = BytesIO()
    logger.debug("Synthesizing speech...")
    with wave.open(buffer, 'wb') as wav_file:
        wav_file.setframerate(voice.config.sample_rate)
        wav_file.setsampwidth(2)   # 16-bit samples
        wav_file.setnchannels(1)   # mono
        voice.synthesize(text, wav_file, sentence_silence=sentence_silence, length_scale=length_scale)

    buffer.seek(0)
    logger.debug("Speech synthesis complete.")
    return buffer
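
# Illustrative sketch (kept commented out so it does not run at import time):
# synthesize_speech can also be used outside the Flask routes to write a WAV file
# directly. The repo_id/model_filename pair below is the first entry of
# available_models; the output path is an arbitrary choice for this example.
#
# wav_buffer = synthesize_speech(
#     repo_id="csukuangfj/vits-piper-en_US-lessac-medium",
#     model_filename="en_US-lessac-medium.onnx",
#     text="Hello from Piper.",
#     sentence_silence=0.2,
#     length_scale=1.0,
# )
# with open("sample.wav", "wb") as f:
#     f.write(wav_buffer.read())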

@app.route('/')
def index():
    return '''<!DOCTYPE html>
<html>
  <head><title>TTS Server</title></head>
  <body>
    <h1>TTS Server is Running</h1>
    <p>Use the /tts endpoint to synthesize speech.</p>
    <p>Send a POST request with JSON data containing the model, text, sentence_silence, and length_scale parameters.</p>
  </body>
</html>
'''
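
# For reference: GET /models (defined below) returns available_models serialized as
# JSON, i.e. a list of objects of the form
#   {"repo_id": "csukuangfj/vits-piper-en_US-lessac-medium",
#    "filename": "en_US-lessac-medium.onnx"}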

@app.route('/models')
def models():
    return jsonify(available_models)


@app.route('/tts', methods=['POST'])
def tts():
    # silent=True returns None instead of raising when the body is missing or not valid JSON,
    # so the check below can report the error itself.
    data = request.get_json(silent=True)
    if not data:
        logger.error("No data received in request.")
        return jsonify({"error": "No data provided"}), 400

    model = data.get('model', '')
    text = data.get('text', '')
    sentence_silence = float(data.get('sentence_silence', 0.1))
    length_scale = float(data.get('length_scale', 1.0))

    if not model:
        logger.error("No model provided in request.")
        return jsonify({"error": "Model parameter is required"}), 400
    if not text:
        logger.error("No text provided in request.")
        return jsonify({"error": "Text parameter is required"}), 400

    # Find the model in the available models list
    selected_model = next((m for m in available_models if m["filename"] == model), None)
    if not selected_model:
        logger.error(f"Model {model} not found.")
        return jsonify({"error": f"Model {model} not found"}), 404

    logger.info(f"Received request: model={model}, text={text}, "
                f"sentence_silence={sentence_silence}, length_scale={length_scale}")

    try:
        audio_buffer = synthesize_speech(selected_model['repo_id'], selected_model['filename'],
                                         text, sentence_silence, length_scale)
    except Exception as e:
        logger.exception("Error during speech synthesis.")
        return jsonify({"error": str(e)}), 500

    return send_file(audio_buffer, mimetype="audio/wav", as_attachment=True, download_name="output.wav")


if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=7860)
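
# Illustrative client sketch (not part of the server): one way to exercise the /tts
# endpoint once the app is running. Assumes the server is reachable on
# http://localhost:7860 (the port configured above) and that the `requests` package
# is installed; neither assumption is required by the server itself.
#
# import requests
#
# payload = {
#     "model": "en_US-lessac-medium.onnx",   # must match a "filename" in available_models
#     "text": "Hello from Piper.",
#     "sentence_silence": 0.2,
#     "length_scale": 1.0,
# }
# response = requests.post("http://localhost:7860/tts", json=payload)
# response.raise_for_status()
# with open("output.wav", "wb") as f:
#     f.write(response.content)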