edge-TTS

Sleeping

App Files Community

vuxuanhoan committed on Oct 12, 2024

Commit

20bc263

verified ·

1 Parent(s): 431f3dc

Update app.py

Browse files

Files changed (1) hide show

app.py +111 -75

app.py CHANGED Viewed

@@ -1,85 +1,121 @@
 import gradio as gr
-import edge_tts
-import io
 import os
 import time
 import asyncio
-from docx import Document
-AUDIO_DIR = 'audio_files'  # Thư mục để lưu tệp âm thanh
-MAX_FILE_AGE = 24 * 60 * 60  # Thời gian lưu trữ tệp âm thanh (24 giờ)
-# Hàm để lấy tất cả các giọng nói có sẵn
-async def get_voices():
-    voices = await edge_tts.list_voices()
-    return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
-async def text_to_speech(text, lang):
-    tts = edge_tts.Communicate(text, voice=lang)
-    # Đường dẫn cho tệp âm thanh
-    os.makedirs(AUDIO_DIR, exist_ok=True)  # Tạo thư mục nếu chưa tồn tại
-    audio_file_name = f"{time.time()}.mp3"
-    audio_file_path = os.path.join(AUDIO_DIR, audio_file_name)
-    # Lưu âm thanh vào tệp
-    await tts.save(audio_file_path)  # Lưu trực tiếp vào đường dẫn hợp lệ
-    delete_old_audio_files()  # Xóa các tệp âm thanh cũ
-    return audio_file_path, audio_file_path  # Trả về đường dẫn tệp âm thanh
-def delete_old_audio_files():
-    now = time.time()
-    for file_name in os.listdir(AUDIO_DIR):
-        file_path = os.path.join(AUDIO_DIR, file_name)
-        if now - os.path.getmtime(file_path) > MAX_FILE_AGE:
-            os.remove(file_path)
-async def txt_to_speech(file, lang):
-    with open(file.name, 'r') as f:
-        text = f.read()
-    return await text_to_speech(text, lang)
-async def docx_to_speech(file, lang):
-    doc = Document(file.name)
-    text = "\n".join([para.text for para in doc.paragraphs])  # Lấy tất cả văn bản từ các đoạn
-    return await text_to_speech(text, lang)
 # Tạo giao diện Gradio
-async def create_interface():
-    voices = await get_voices()  # Lấy danh sách giọng nói
-    with gr.Blocks() as iface:
-        with gr.Tab("Text to Speech"):
-            gr.Markdown("### Convert text to speech")
-            text_input = gr.Textbox(lines=10, label="Enter your text here:")
-            lang_input = gr.Dropdown(choices=list(voices.keys()), label="Select language:")  # Cập nhật dropdown giọng nói
-            audio_output, file_output = gr.Audio(label="Audio"), gr.File(label="Audio File")
-            gr.Button("Convert").click(fn=lambda text, lang: asyncio.run(text_to_speech(text, voices[lang])),
-                                        inputs=[text_input, lang_input],
-                                        outputs=[audio_output, file_output])
-        with gr.Tab("TXT to Speech"):
-            gr.Markdown("### Convert .txt file to speech")
-            file_input = gr.File(label="Upload your .txt file")
-            lang_input_file = gr.Dropdown(choices=list(voices.keys()), label="Select language:")  # Cập nhật dropdown giọng nói
-            audio_output_file, file_output_file = gr.Audio(label="Audio"), gr.File(label="Audio File")
-            gr.Button("Convert").click(fn=lambda file, lang: asyncio.run(txt_to_speech(file, voices[lang])),
-                                        inputs=[file_input, lang_input_file],
-                                        outputs=[audio_output_file, file_output_file])
-        with gr.Tab("DOCX to Speech"):
-            gr.Markdown("### Convert .docx file to speech")
-            docx_file_input = gr.File(label="Upload your .docx file")
-            lang_input_docx = gr.Dropdown(choices=list(voices.keys()), label="Select language:")  # Cập nhật dropdown giọng nói
-            audio_output_docx, file_output_docx = gr.Audio(label="Audio"), gr.File(label="Audio File")
-            gr.Button("Convert").click(fn=lambda file, lang: asyncio.run(docx_to_speech(file, voices[lang])),
-                                        inputs=[docx_file_input, lang_input_docx],
-                                        outputs=[audio_output_docx, file_output_docx])
-    iface.launch(enable_queue=True)
-# Chạy ứng dụng
-asyncio.run(create_interface())

 import gradio as gr
 import os
 import time
+import uuid
+import re
 import asyncio
+import torch
+import torchaudio
+from huggingface_hub import HfApi, hf_hub_download, snapshot_download
+from TTS.tts.configs.xtts_config import XttsConfig
+from TTS.tts.models.xtts import Xtts
+from vinorm import TTSnorm
+# Download the unidic dictionary data needed by MeCab.
+os.system("python -m unidic download")
+HF_TOKEN = os.environ.get("HF_TOKEN")
+api = HfApi(token=HF_TOKEN)
+# Fetch the viXTTS checkpoint only when a required file is missing locally,
+# so restarts reuse the files already present in checkpoint_dir.
+print("Downloading if not downloaded viXTTS")
+checkpoint_dir = "model/"
+repo_id = "capleaf/viXTTS"
+use_deepspeed = False
+os.makedirs(checkpoint_dir, exist_ok=True)
+required_files = ["model.pth", "config.json", "vocab.json", "speakers_xtts.pth"]
+files_in_dir = set(os.listdir(checkpoint_dir))
+if not files_in_dir.issuperset(required_files):
+    snapshot_download(
+        repo_id=repo_id,
+        repo_type="model",
+        local_dir=checkpoint_dir,
+    )
+    # speakers_xtts.pth is pulled from the upstream XTTS-v2 repository.
+    hf_hub_download(
+        repo_id="coqui/XTTS-v2",
+        filename="speakers_xtts.pth",
+        local_dir=checkpoint_dir,
+    )
+# Build the model from the downloaded config and load its weights.
+xtts_config = os.path.join(checkpoint_dir, "config.json")
+config = XttsConfig()
+config.load_json(xtts_config)
+MODEL = Xtts.init_from_config(config)
+MODEL.load_checkpoint(
+    config, checkpoint_dir=checkpoint_dir, use_deepspeed=use_deepspeed
+)
+# Move the model to the GPU when one is available (requires `import torch`).
+if torch.cuda.is_available():
+    MODEL.cuda()
+# Languages offered in the UI; Vietnamese is added when the config lacks it.
+supported_languages = config.languages
+if "vi" not in supported_languages:
+    supported_languages.append("vi")
+def normalize_vietnamese_text(text):
+    """Run TTSnorm on *text*, then apply a fixed list of string clean-ups.
+
+    Returns the normalized string.
+    """
+    # (substring to find, replacement) pairs, applied strictly in this order.
+    cleanup_steps = (
+        ("..", "."),
+        ("!.", "!"),
+        ("?.", "?"),
+        (" .", "."),
+        (" ,", ","),
+        ('"', ""),
+        ("'", ""),
+        ("AI", "Ây Ai"),
+        ("A.I", "Ây Ai"),
+    )
+    normalized = TTSnorm(text, unknown=False, lower=False, rule=True)
+    for needle, replacement in cleanup_steps:
+        normalized = normalized.replace(needle, replacement)
+    return normalized
+async def text_to_speech(text, lang, audio_file_path):
+    """Synthesize *text* in language *lang* and write it to *audio_file_path*.
+
+    Returns a ``(path, error)`` pair: ``(audio_file_path, None)`` on success,
+    or ``(None, message)`` when validation or synthesis fails.
+    """
+    # Treat a missing value as empty and ignore surrounding whitespace, so
+    # blank input is rejected here instead of being sent to the model.
+    text = (text or "").strip()
+    if lang not in supported_languages:
+        return None, "Language not supported."
+    if len(text) < 2:
+        return None, "Please provide a longer text."
+    if len(text) > 250:
+        return None, "Text is too long, please keep it under 250 characters."
+    if lang == "vi":
+        text = normalize_vietnamese_text(text)
+    try:
+        print("Generating new audio...")
+        # NOTE(review): both conditioning inputs are passed as None. Xtts
+        # inference presumably needs real speaker latents - confirm; if so,
+        # every call ends in the except branch below.
+        out = MODEL.inference(
+            text,
+            lang,
+            gpt_cond_latent=None,
+            speaker_embedding=None,
+            repetition_penalty=5.0,
+            temperature=0.75,
+            enable_text_splitting=True,
+        )
+        # unsqueeze(0) adds a leading dimension before saving at 24000 Hz.
+        torchaudio.save(audio_file_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)
+        return audio_file_path, None
+    except Exception as e:
+        # Report the failure as a message instead of raising, so callers
+        # always receive the (path, error) pair.
+        return None, f"Error during synthesis: {str(e)}"
+# Directory where generated audio files are stored
+AUDIO_DIR = 'audio_files'
+os.makedirs(AUDIO_DIR, exist_ok=True)
+async def convert_text_to_speech(text, lang):
+    """Synthesize *text* in *lang* into a new .wav file under AUDIO_DIR.
+
+    Returns whatever ``text_to_speech`` returns for the generated path.
+    """
+    # A random UUID instead of time.time(): two requests arriving in the same
+    # clock tick must not share (and overwrite) one output file.
+    audio_file_name = f"{uuid.uuid4().hex}.wav"
+    audio_file_path = os.path.join(AUDIO_DIR, audio_file_name)
+    return await text_to_speech(text, lang, audio_file_path)
 # Tạo giao diện Gradio
+async def _convert_for_ui(text, lang):
+    """Adapt convert_text_to_speech's (path, error) result to the UI outputs.
+
+    On failure the message is raised as a gr.Error instead of being sent to
+    the File component as if it were a path. On success the same path feeds
+    both the audio player and the download link.
+    """
+    audio_path, error = await convert_text_to_speech(text, lang)
+    if error is not None:
+        raise gr.Error(error)
+    return audio_path, audio_path
+with gr.Blocks() as iface:
+    with gr.Tab("Text to Speech"):
+        gr.Markdown("### Convert text to speech")
+        text_input = gr.Textbox(lines=10, label="Enter your text here:")
+        lang_input = gr.Dropdown(choices=supported_languages, label="Select language:")
+        audio_output, file_output = gr.Audio(label="Audio"), gr.File(label="Audio File")
+        # The async handler is passed directly, replacing the asyncio.run lambda.
+        gr.Button("Convert").click(fn=_convert_for_ui,
+                                    inputs=[text_input, lang_input],
+                                    outputs=[audio_output, file_output])
+# Enable the queue via queue() rather than the launch(enable_queue=...) keyword.
+# NOTE(review): newer Gradio releases presumably reject that keyword - confirm.
+iface.queue()
+iface.launch()