File size: 3,078 Bytes
fad2e64
c6169e1
 
 
fad2e64
c6169e1
 
 
 
 
 
 
fad2e64
 
 
 
c6169e1
fad2e64
c6169e1
 
 
fad2e64
c6169e1
 
 
fad2e64
c6169e1
 
 
 
 
 
 
fad2e64
c6169e1
 
 
 
fad2e64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c6169e1
 
 
 
fad2e64
 
 
 
c6169e1
 
 
 
fad2e64
 
 
 
 
 
 
 
 
 
c6169e1
 
 
 
fad2e64
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import logging
import wave
import numpy as np
from io import BytesIO
from flask import Flask, request, send_file, jsonify
from flask_cors import CORS
from huggingface_hub import hf_hub_download
from piper import PiperVoice

# Flask application object; the route decorators further down register their
# handlers on it.
app = Flask(__name__)
# Apply flask_cors to the whole app with its default options (none are passed).
# NOTE(review): presumably this allows cross-origin requests from any origin --
# confirm against the flask_cors defaults before exposing the server publicly.
CORS(app)

# Set up logging: configure the root logger at DEBUG level and create the
# module-level logger used by the functions in this file.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Hugging Face repository and file names of the Piper voice used for synthesis.
_VOICE_REPO_ID = "csukuangfj/vits-piper-en_US-lessac-medium"
_VOICE_MODEL_FILENAME = "en_US-lessac-medium.onnx"
_VOICE_CONFIG_FILENAME = "en_US-lessac-medium.onnx.json"

# Voice loaded by the first call to _get_voice() and reused by every later call.
_voice = None


def _get_voice():
    """Return the shared PiperVoice, downloading and loading it on first use."""
    global _voice
    if _voice is None:
        logger.debug("Downloading model and config files...")
        model_path = hf_hub_download(repo_id=_VOICE_REPO_ID, filename=_VOICE_MODEL_FILENAME)
        config_path = hf_hub_download(repo_id=_VOICE_REPO_ID, filename=_VOICE_CONFIG_FILENAME)

        logger.debug("Loading PiperVoice model...")
        # No lock on purpose: two concurrent first calls may both load the
        # model, and the later assignment simply replaces the earlier one.
        _voice = PiperVoice.load(model_path, config_path)
    return _voice


def synthesize_speech(text, sentence_silence, length_scale):
    """Synthesize ``text`` with the Piper voice and return it as in-memory WAV data.

    The voice model is loaded once (on the first call) and reused afterwards,
    so later calls do not repeat the download lookup and model load.

    Args:
        text: Text to speak; forwarded unchanged to ``PiperVoice.synthesize``.
        sentence_silence: Forwarded as the ``sentence_silence`` keyword of
            ``PiperVoice.synthesize``.
        length_scale: Forwarded as the ``length_scale`` keyword of
            ``PiperVoice.synthesize``.

    Returns:
        A ``BytesIO`` holding the WAV output, rewound to offset 0 so it can be
        read or sent directly.

    Raises:
        Any exception raised by the download, model load, or synthesis is
        propagated to the caller.
    """
    voice = _get_voice()

    buffer = BytesIO()
    logger.debug("Synthesizing speech...")
    with wave.open(buffer, 'wb') as wav_file:
        # Header parameters: the voice's sample rate, 2-byte samples, 1 channel.
        wav_file.setframerate(voice.config.sample_rate)
        wav_file.setsampwidth(2)
        wav_file.setnchannels(1)
        voice.synthesize(text, wav_file, sentence_silence=sentence_silence, length_scale=length_scale)

    # Rewind so the caller reads from the start of the WAV data.
    buffer.seek(0)
    logger.debug("Speech synthesis complete.")
    return buffer

@app.route('/')
def index():
    """Serve the static landing page that points callers at the ``/tts`` endpoint."""
    landing_page = '''
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>TTS Server</title>
        <style>
            body { font-family: Arial, sans-serif; margin: 40px; }
            h1 { color: #333; }
            p { font-size: 1.2em; }
            code { background: #f4f4f4; padding: 2px 4px; border-radius: 4px; }
        </style>
    </head>
    <body>
        <h1>TTS Server is Running</h1>
        <p>Use the <code>/tts</code> endpoint to synthesize speech.</p>
        <p>Send a POST request with JSON data containing the <code>text</code>, <code>sentence_silence</code>, and <code>length_scale</code> parameters.</p>
    </body>
    </html>
    '''
    return landing_page

@app.route('/tts', methods=['POST'])
def tts():
    """Synthesize the posted text and return it as a WAV download.

    Expects a JSON object body with:
        text: Non-empty string to speak (required).
        sentence_silence: Number forwarded to ``synthesize_speech``
            (default 0.1).
        length_scale: Number forwarded to ``synthesize_speech`` (default 1.0).

    Returns:
        On success, the WAV data as an attachment named ``output.wav``.
        Otherwise a JSON ``{"error": ...}`` body with status 400 for an
        invalid request, or 500 when synthesis raises.
    """
    # silent=True returns None instead of raising on a non-JSON content type
    # or a malformed body, so those cases get the JSON 400 response below.
    data = request.get_json(silent=True)
    if not data:
        logger.error("No data received in request.")
        return jsonify({"error": "No data provided"}), 400
    if not isinstance(data, dict):
        # A JSON array/string/number has no .get(); reject it explicitly.
        logger.error("Request body is not a JSON object.")
        return jsonify({"error": "Request body must be a JSON object"}), 400

    text = data.get('text', '')
    if not text:
        logger.error("No text provided in request.")
        return jsonify({"error": "Text parameter is required"}), 400
    if not isinstance(text, str):
        logger.error("Text parameter is not a string.")
        return jsonify({"error": "Text parameter must be a string"}), 400

    try:
        sentence_silence = float(data.get('sentence_silence', 0.1))
        length_scale = float(data.get('length_scale', 1.0))
    except (TypeError, ValueError, OverflowError):
        # Covers null, non-numeric strings, nested values and oversized ints.
        logger.error("Invalid sentence_silence or length_scale in request.")
        return jsonify({"error": "sentence_silence and length_scale must be numbers"}), 400

    logger.info(
        "Received request: text=%s, sentence_silence=%s, length_scale=%s",
        text, sentence_silence, length_scale,
    )
    try:
        audio_buffer = synthesize_speech(text, sentence_silence, length_scale)
    except Exception as e:
        # Top-level boundary: log the traceback and report the failure as JSON.
        logger.exception("Error during speech synthesis.")
        return jsonify({"error": str(e)}), 500

    return send_file(audio_buffer, mimetype="audio/wav", as_attachment=True, download_name="output.wav")

if __name__ == '__main__':
    # Script entry point: start Flask's built-in server on all interfaces,
    # port 7860.
    import os

    # Debug mode enables the Werkzeug interactive debugger, which can execute
    # arbitrary Python from the browser. Because the server binds to 0.0.0.0,
    # debug is opt-in via FLASK_DEBUG instead of being always on.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(debug=debug_enabled, host='0.0.0.0', port=7860)