File size: 4,032 Bytes
fad2e64
c6169e1
 
 
fad2e64
c6169e1
 
 
 
 
 
 
fad2e64
 
 
 
d690791
 
 
 
 
 
 
 
fad2e64
d690791
 
c6169e1
fad2e64
c6169e1
 
 
fad2e64
c6169e1
 
 
 
 
 
 
fad2e64
c6169e1
 
 
 
fad2e64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d690791
fad2e64
 
 
c6169e1
d690791
 
 
 
c6169e1
 
 
fad2e64
 
 
 
d690791
c6169e1
 
 
 
d690791
 
 
 
fad2e64
 
 
 
d690791
 
 
 
 
 
 
fad2e64
d690791
fad2e64
 
 
c6169e1
 
 
 
fad2e64
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import logging
import wave
import numpy as np
from io import BytesIO
from flask import Flask, request, send_file, jsonify
from flask_cors import CORS
from huggingface_hub import hf_hub_download
from piper import PiperVoice

# Flask application object; the route decorators below register against it.
app = Flask(__name__)
# Wrap the app with flask_cors using no extra options, so the extension's
# defaults apply.
CORS(app)

# Set up logging: configure the root logger at DEBUG level and create a
# module-level logger named after this module.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Voices this server can synthesize with. Each entry pairs the repository id
# with the name of the ONNX model file stored in it; the /tts endpoint selects
# an entry by its "filename".
available_models = [
    {
        "repo_id": "csukuangfj/vits-piper-en_US-lessac-medium",
        "filename": "en_US-lessac-medium.onnx",
    },
    {
        "repo_id": "csukuangfj/vits-piper-en_US-hfc_female-medium",
        "filename": "en_US-hfc_female-medium.onnx",
    },
    {
        "repo_id": "csukuangfj/vits-piper-en_GB-southern_english_female-medium",
        "filename": "en_GB-southern_english_female-medium.onnx",
    },
]

def synthesize_speech(repo_id, model_filename, text, sentence_silence, length_scale):
    """Synthesize ``text`` with a Piper voice and return in-memory WAV data.

    The model file and its ``<model_filename>.json`` config are fetched with
    ``hf_hub_download`` and loaded through ``PiperVoice.load``.

    Args:
        repo_id: Repository id passed to ``hf_hub_download``.
        model_filename: Name of the model file inside that repository.
        text: Text to synthesize.
        sentence_silence: Forwarded unchanged to ``voice.synthesize``.
        length_scale: Forwarded unchanged to ``voice.synthesize``.

    Returns:
        A ``BytesIO`` containing the WAV output, rewound to position 0.
    """
    # NOTE(review): the voice is fetched and loaded again on every call;
    # nothing in this function keeps it around between requests.
    logger.debug("Downloading model and config files...")
    onnx_path = hf_hub_download(repo_id=repo_id, filename=model_filename)
    json_path = hf_hub_download(
        repo_id=repo_id,
        filename=f"{model_filename}.json",
    )

    logger.debug("Loading PiperVoice model...")
    voice = PiperVoice.load(onnx_path, json_path)

    logger.debug("Synthesizing speech...")
    wav_stream = BytesIO()
    with wave.open(wav_stream, 'wb') as wav_out:
        # WAV header: one channel, 2 bytes per sample, the voice's own rate.
        wav_out.setnchannels(1)
        wav_out.setsampwidth(2)
        wav_out.setframerate(voice.config.sample_rate)
        voice.synthesize(
            text,
            wav_out,
            sentence_silence=sentence_silence,
            length_scale=length_scale,
        )

    # Rewind so the caller reads from the start of the audio.
    wav_stream.seek(0)
    logger.debug("Speech synthesis complete.")
    return wav_stream

@app.route('/')
def index():
    """Serve a static HTML landing page describing how to call ``/tts``."""
    page = '''
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>TTS Server</title>
        <style>
            body { font-family: Arial, sans-serif; margin: 40px; }
            h1 { color: #333; }
            p { font-size: 1.2em; }
            code { background: #f4f4f4; padding: 2px 4px; border-radius: 4px; }
        </style>
    </head>
    <body>
        <h1>TTS Server is Running</h1>
        <p>Use the <code>/tts</code> endpoint to synthesize speech.</p>
        <p>Send a POST request with JSON data containing the <code>model</code>, <code>text</code>, <code>sentence_silence</code>, and <code>length_scale</code> parameters.</p>
    </body>
    </html>
    '''
    return page

@app.route('/models')
def models():
    """Return the ``available_models`` list serialized as a JSON response."""
    payload = jsonify(available_models)
    return payload

def _parse_float_param(data, key, default):
    """Read ``data[key]`` (or ``default``) as a finite float.

    Args:
        data: Mapping holding the decoded JSON request body.
        key: Name of the parameter to read.
        default: Value used when ``key`` is absent from ``data``.

    Returns:
        The parameter converted with ``float()``.

    Raises:
        ValueError: If the value cannot be converted to a float, or converts
            to NaN or an infinity. The message names the offending parameter.
    """
    import math  # local import so this block does not rely on file-level imports

    raw = data.get(key, default)
    try:
        value = float(raw)
    except (TypeError, ValueError, OverflowError):
        raise ValueError(f"{key} must be a number") from None
    if not math.isfinite(value):
        raise ValueError(f"{key} must be a finite number")
    return value


@app.route('/tts', methods=['POST'])
def tts():
    """Synthesize speech for a JSON request and return it as a WAV download.

    Expected JSON object fields:
        model: ``filename`` of an entry in ``available_models`` (required).
        text: Text to synthesize (required, must be a string).
        sentence_silence: Number, defaults to 0.1.
        length_scale: Number, defaults to 1.0.

    Returns:
        The WAV file as an attachment named ``output.wav`` on success;
        otherwise a JSON ``{"error": ...}`` body with status 400 (missing or
        invalid input), 404 (unknown model) or 500 (synthesis failed).
    """
    # silent=True makes a missing or unparsable JSON body come back as None,
    # so it is reported through the 400 branch below.
    data = request.get_json(silent=True)
    if not data:
        logger.error("No data received in request.")
        return jsonify({"error": "No data provided"}), 400

    # A JSON list/string/number has no .get(); reject it as a client error.
    if not isinstance(data, dict):
        logger.error("Request body is not a JSON object.")
        return jsonify({"error": "JSON body must be an object"}), 400

    model = data.get('model', '')
    text = data.get('text', '')

    # Malformed numbers are the client's mistake: answer 400 rather than
    # letting float() raise.
    try:
        sentence_silence = _parse_float_param(data, 'sentence_silence', 0.1)
        length_scale = _parse_float_param(data, 'length_scale', 1.0)
    except ValueError as e:
        logger.error("Invalid numeric parameter: %s", e)
        return jsonify({"error": str(e)}), 400

    if not model:
        logger.error("No model provided in request.")
        return jsonify({"error": "Model parameter is required"}), 400

    if not text:
        logger.error("No text provided in request.")
        return jsonify({"error": "Text parameter is required"}), 400

    if not isinstance(text, str):
        logger.error("Text parameter is not a string.")
        return jsonify({"error": "Text parameter must be a string"}), 400

    # Find the model in the available models list
    selected_model = next((m for m in available_models if m["filename"] == model), None)
    if not selected_model:
        logger.error("Model %s not found.", model)
        return jsonify({"error": f"Model {model} not found"}), 404

    logger.info(
        "Received request: model=%s, text=%s, sentence_silence=%s, length_scale=%s",
        model, text, sentence_silence, length_scale,
    )
    try:
        audio_buffer = synthesize_speech(
            selected_model['repo_id'],
            selected_model['filename'],
            text,
            sentence_silence,
            length_scale,
        )
    except Exception as e:
        # Top-level boundary for the request: log the traceback and report
        # the failure to the client instead of propagating.
        logger.exception("Error during speech synthesis.")
        return jsonify({"error": str(e)}), 500

    return send_file(audio_buffer, mimetype="audio/wav", as_attachment=True, download_name="output.wav")

if __name__ == '__main__':
    import os  # local import so this block does not rely on file-level imports

    # The server binds to every interface (0.0.0.0), so the interactive
    # debugger must not be on by default: it lets anyone who can reach the
    # port run code on this machine. Opt in explicitly with FLASK_DEBUG=1
    # for local development.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(debug=debug_enabled, host='0.0.0.0', port=7860)