# tts / app.py
# ChandimaPrabath's picture
# v0.1
# d690791
import logging
import wave
import numpy as np
from io import BytesIO
from flask import Flask, request, send_file, jsonify
from flask_cors import CORS
from huggingface_hub import hf_hub_download
from piper import PiperVoice
# Flask application object; CORS is enabled for every route and origin.
app = Flask(__name__)
CORS(app)

# Root logging is configured at DEBUG so synthesis progress messages are shown.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Voice models clients may request. "repo_id" is the Hugging Face repository
# and "filename" is the ONNX file inside it; "filename" is also the identifier
# the /tts endpoint matches the request's "model" field against.
available_models = [
    dict(
        repo_id="csukuangfj/vits-piper-en_US-lessac-medium",
        filename="en_US-lessac-medium.onnx",
    ),
    dict(
        repo_id="csukuangfj/vits-piper-en_US-hfc_female-medium",
        filename="en_US-hfc_female-medium.onnx",
    ),
    dict(
        repo_id="csukuangfj/vits-piper-en_GB-southern_english_female-medium",
        filename="en_GB-southern_english_female-medium.onnx",
    ),
]
# Loaded voices keyed by (repo_id, model_filename). Loading a voice means
# resolving two files on the Hub and building the model, so it is done once
# per process instead of once per request.
_voice_cache = {}


def _load_voice(repo_id, model_filename):
    """Return the PiperVoice for a model, downloading and loading it on first use.

    The config file is expected next to the model as ``<model_filename>.json``.
    """
    cache_key = (repo_id, model_filename)
    voice = _voice_cache.get(cache_key)
    if voice is None:
        logger.debug("Downloading model and config files...")
        model_path = hf_hub_download(repo_id=repo_id, filename=model_filename)
        config_path = hf_hub_download(repo_id=repo_id, filename=f"{model_filename}.json")
        logger.debug("Loading PiperVoice model...")
        voice = PiperVoice.load(model_path, config_path)
        _voice_cache[cache_key] = voice
    return voice


def synthesize_speech(repo_id, model_filename, text, sentence_silence, length_scale):
    """Synthesize ``text`` with the given Piper voice and return it as WAV data.

    Args:
        repo_id: Hugging Face repository that hosts the model.
        model_filename: Name of the ONNX model file inside the repository.
        text: Text to speak.
        sentence_silence: Forwarded to ``PiperVoice.synthesize`` unchanged.
        length_scale: Forwarded to ``PiperVoice.synthesize`` unchanged.

    Returns:
        A ``BytesIO`` positioned at offset 0 that holds a 16-bit mono WAV file
        at the voice's configured sample rate.

    Raises:
        Whatever the Hub download, model loading or synthesis raises; nothing
        is caught here.
    """
    # NOTE(review): the cached voice object is shared by all requests; confirm
    # PiperVoice.synthesize is safe to call concurrently if the server is threaded.
    voice = _load_voice(repo_id, model_filename)

    buffer = BytesIO()
    logger.debug("Synthesizing speech...")
    with wave.open(buffer, 'wb') as wav_file:
        wav_file.setframerate(voice.config.sample_rate)
        wav_file.setsampwidth(2)  # 2 bytes per sample = 16-bit audio
        wav_file.setnchannels(1)  # mono
        voice.synthesize(text, wav_file, sentence_silence=sentence_silence, length_scale=length_scale)

    # Closing the wave writer finalizes the header but leaves the BytesIO open
    # (wave only closes files it opened itself), so rewind it for the reader.
    buffer.seek(0)
    logger.debug("Speech synthesis complete.")
    return buffer
# Static landing page served at the site root.
_INDEX_PAGE = '''
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>TTS Server</title>
<style>
body { font-family: Arial, sans-serif; margin: 40px; }
h1 { color: #333; }
p { font-size: 1.2em; }
code { background: #f4f4f4; padding: 2px 4px; border-radius: 4px; }
</style>
</head>
<body>
<h1>TTS Server is Running</h1>
<p>Use the <code>/tts</code> endpoint to synthesize speech.</p>
<p>Send a POST request with JSON data containing the <code>model</code>, <code>text</code>, <code>sentence_silence</code>, and <code>length_scale</code> parameters.</p>
</body>
</html>
'''


@app.route('/')
def index():
    """Serve a small HTML page confirming the server is up and describing /tts."""
    return _INDEX_PAGE
@app.route('/models')
def models():
    """Return the list of selectable voice models as a JSON array."""
    catalogue = jsonify(available_models)
    return catalogue
def _parse_float_param(data, key, default):
    """Read ``key`` from the request dict as a float, using ``default`` if absent.

    Raises:
        ValueError: if the supplied value cannot be converted to a float
            (for example ``null``, a list, or a non-numeric string).
    """
    try:
        return float(data.get(key, default))
    except (TypeError, ValueError):
        raise ValueError(f"{key} must be a number") from None


@app.route('/tts', methods=['POST'])
def tts():
    """Synthesize speech from a JSON request and return it as a WAV attachment.

    Expected JSON object fields:
        model: ``filename`` of an entry in ``available_models`` (required).
        text: text to speak, as a string (required).
        sentence_silence: number, defaults to 0.1.
        length_scale: number, defaults to 1.0.

    Responses:
        200 with ``audio/wav`` body named ``output.wav`` on success.
        400 with a JSON ``error`` when the body is missing, is not a JSON
            object, or a field is missing or invalid.
        404 with a JSON ``error`` when the model is not in ``available_models``.
        500 with a JSON ``error`` when synthesis raises.
    """
    # silent=True turns a wrong content type or malformed JSON into None, so
    # the client gets this endpoint's JSON error instead of a Werkzeug HTML page.
    data = request.get_json(silent=True)
    if not data:
        logger.error("No data received in request.")
        return jsonify({"error": "No data provided"}), 400
    if not isinstance(data, dict):
        # A JSON array or scalar has no .get(); reject it rather than crash.
        logger.error("Request body is not a JSON object.")
        return jsonify({"error": "Request body must be a JSON object"}), 400

    model = data.get('model', '')
    text = data.get('text', '')

    if not model:
        logger.error("No model provided in request.")
        return jsonify({"error": "Model parameter is required"}), 400
    if not text:
        logger.error("No text provided in request.")
        return jsonify({"error": "Text parameter is required"}), 400
    if not isinstance(text, str):
        logger.error("Text parameter is not a string.")
        return jsonify({"error": "Text parameter must be a string"}), 400

    # Bad numeric input is a client error (400), not a server fault.
    try:
        sentence_silence = _parse_float_param(data, 'sentence_silence', 0.1)
        length_scale = _parse_float_param(data, 'length_scale', 1.0)
    except ValueError as e:
        logger.error(f"Invalid numeric parameter: {e}")
        return jsonify({"error": str(e)}), 400

    # Find the model in the available models list
    selected_model = next((m for m in available_models if m["filename"] == model), None)
    if not selected_model:
        logger.error(f"Model {model} not found.")
        return jsonify({"error": f"Model {model} not found"}), 404

    logger.info(f"Received request: model={model}, text={text}, sentence_silence={sentence_silence}, length_scale={length_scale}")

    try:
        audio_buffer = synthesize_speech(selected_model['repo_id'], selected_model['filename'], text, sentence_silence, length_scale)
    except Exception as e:
        # Top-level boundary: log the traceback and report the failure as JSON.
        logger.exception("Error during speech synthesis.")
        return jsonify({"error": str(e)}), 500

    return send_file(audio_buffer, mimetype="audio/wav", as_attachment=True, download_name="output.wav")
if __name__ == '__main__':
    import os

    # The server listens on all interfaces, and Flask's debug mode enables the
    # Werkzeug interactive debugger, which lets anyone who can reach the port
    # execute arbitrary Python. Debug mode is therefore opt-in: set
    # FLASK_DEBUG=1 (or true/yes/on) for local development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(debug=debug_enabled, host='0.0.0.0', port=7860)