bgtts / app.py
englissi's picture
Update app.py
2ec67ac verified
raw
history blame
1.4 kB
import gradio as gr
from gtts import gTTS
from pydub import AudioSegment
import numpy as np
import os
import io
def text_to_speech(prompt):
    """Synthesize Bulgarian speech for ``prompt`` using gTTS.

    Args:
        prompt: Bulgarian text to be spoken.

    Returns:
        A ``(sample_rate, samples)`` tuple, where ``sample_rate`` is the
        decoded frame rate in Hz and ``samples`` is a 1-D mono NumPy array.
        This is the order a ``gr.Audio(type="numpy")`` output expects.
        Returns ``None`` when ``prompt`` is empty or whitespace-only, which
        leaves the audio output empty.
    """
    # gTTS rejects empty text, so bail out before making any request.
    if not prompt or not prompt.strip():
        return None

    # Render the MP3 into memory. A fixed "output.mp3" on disk would be
    # shared by concurrent requests and left behind if decoding failed.
    mp3_buffer = io.BytesIO()
    gTTS(text=prompt, lang="bg").write_to_fp(mp3_buffer)
    mp3_buffer.seek(0)

    # Decode the MP3 with pydub and expose the raw samples as a NumPy array.
    sound = AudioSegment.from_file(mp3_buffer, format="mp3")
    samples = np.array(sound.get_array_of_samples())

    # Multi-channel audio arrives interleaved; average the channels to mono.
    if sound.channels > 1:
        samples = samples.reshape((-1, sound.channels))
        samples = samples.mean(axis=1).astype(np.int16)

    # Sample rate first, then data: the tuple order gr.Audio expects.
    return sound.frame_rate, samples
# Build the page: a heading, a row holding the text input next to the audio
# player, and a button that runs the synthesis.
with gr.Blocks() as demo:
    gr.Markdown(value="## Bulgarian Text-to-Speech (TTS)")

    with gr.Row():
        input_prompt = gr.Textbox(label="Enter a prompt in Bulgarian:")
        output_audio = gr.Audio(label="Generated Speech", type="numpy")

    generate_button = gr.Button(value="Generate Speech")

    # Clicking the button sends the textbox content to text_to_speech and
    # routes its return value into the audio component.
    generate_button.click(
        fn=text_to_speech,
        inputs=input_prompt,
        outputs=output_audio,
    )

# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()