vuxuanhoan committed on
Commit
20bc263
·
verified ·
1 Parent(s): 431f3dc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +111 -75
app.py CHANGED
@@ -1,85 +1,121 @@
1
  import gradio as gr
2
- import edge_tts
3
- import io
4
  import os
5
  import time
 
 
6
  import asyncio
7
- from docx import Document
8
-
9
- AUDIO_DIR = 'audio_files' # Thư mục để lưu tệp âm thanh
10
- MAX_FILE_AGE = 24 * 60 * 60 # Thời gian lưu trữ tệp âm thanh (24 giờ)
11
-
12
- # Hàm để lấy tất cả các giọng nói có sẵn
13
- async def get_voices():
14
- voices = await edge_tts.list_voices()
15
- return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
16
-
17
- async def text_to_speech(text, lang):
18
- tts = edge_tts.Communicate(text, voice=lang)
19
-
20
- # Đường dẫn cho tệp âm thanh
21
- os.makedirs(AUDIO_DIR, exist_ok=True) # Tạo thư mục nếu chưa tồn tại
22
- audio_file_name = f"{time.time()}.mp3"
23
- audio_file_path = os.path.join(AUDIO_DIR, audio_file_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
- # Lưu âm thanh vào tệp
26
- await tts.save(audio_file_path) # Lưu trực tiếp vào đường dẫn hợp lệ
27
- delete_old_audio_files() # Xóa các tệp âm thanh cũ
28
- return audio_file_path, audio_file_path # Trả về đường dẫn tệp âm thanh
29
 
30
- def delete_old_audio_files():
31
- now = time.time()
32
- for file_name in os.listdir(AUDIO_DIR):
33
- file_path = os.path.join(AUDIO_DIR, file_name)
34
- if now - os.path.getmtime(file_path) > MAX_FILE_AGE:
35
- os.remove(file_path)
 
 
 
 
 
 
 
 
36
 
37
- async def txt_to_speech(file, lang):
38
- with open(file.name, 'r') as f:
39
- text = f.read()
40
- return await text_to_speech(text, lang)
41
 
42
- async def docx_to_speech(file, lang):
43
- doc = Document(file.name)
44
- text = "\n".join([para.text for para in doc.paragraphs]) # Lấy tất cả văn bản từ các đoạn
45
- return await text_to_speech(text, lang)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  # Tạo giao diện Gradio
48
- async def create_interface():
49
- voices = await get_voices() # Lấy danh sách giọng nói
50
-
51
- with gr.Blocks() as iface:
52
- with gr.Tab("Text to Speech"):
53
- gr.Markdown("### Convert text to speech")
54
- text_input = gr.Textbox(lines=10, label="Enter your text here:")
55
- lang_input = gr.Dropdown(choices=list(voices.keys()), label="Select language:") # Cập nhật dropdown giọng nói
56
-
57
- audio_output, file_output = gr.Audio(label="Audio"), gr.File(label="Audio File")
58
- gr.Button("Convert").click(fn=lambda text, lang: asyncio.run(text_to_speech(text, voices[lang])),
59
- inputs=[text_input, lang_input],
60
- outputs=[audio_output, file_output])
61
-
62
- with gr.Tab("TXT to Speech"):
63
- gr.Markdown("### Convert .txt file to speech")
64
- file_input = gr.File(label="Upload your .txt file")
65
- lang_input_file = gr.Dropdown(choices=list(voices.keys()), label="Select language:") # Cập nhật dropdown giọng nói
66
-
67
- audio_output_file, file_output_file = gr.Audio(label="Audio"), gr.File(label="Audio File")
68
- gr.Button("Convert").click(fn=lambda file, lang: asyncio.run(txt_to_speech(file, voices[lang])),
69
- inputs=[file_input, lang_input_file],
70
- outputs=[audio_output_file, file_output_file])
71
-
72
- with gr.Tab("DOCX to Speech"):
73
- gr.Markdown("### Convert .docx file to speech")
74
- docx_file_input = gr.File(label="Upload your .docx file")
75
- lang_input_docx = gr.Dropdown(choices=list(voices.keys()), label="Select language:") # Cập nhật dropdown giọng nói
76
-
77
- audio_output_docx, file_output_docx = gr.Audio(label="Audio"), gr.File(label="Audio File")
78
- gr.Button("Convert").click(fn=lambda file, lang: asyncio.run(docx_to_speech(file, voices[lang])),
79
- inputs=[docx_file_input, lang_input_docx],
80
- outputs=[audio_output_docx, file_output_docx])
81
-
82
- iface.launch(enable_queue=True)
83
-
84
- # Chạy ứng dụng
85
- asyncio.run(create_interface())
 
import asyncio
import os
import re
import subprocess
import sys
import time
import uuid

import gradio as gr
import torch
import torchaudio
from huggingface_hub import HfApi, hf_hub_download, snapshot_download
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from vinorm import TTSnorm

# Download the UniDic dictionary used by MeCab. The module is run with the
# interpreter that is executing this app (sys.executable) instead of whatever
# "python" resolves to on PATH. A failed download is not treated as fatal
# here (check=False).
subprocess.run([sys.executable, "-m", "unidic", "download"], check=False)

HF_TOKEN = os.environ.get("HF_TOKEN")
api = HfApi(token=HF_TOKEN)

# Download the viXTTS checkpoint unless every required file is already present.
print("Downloading if not downloaded viXTTS")
checkpoint_dir = "model/"
repo_id = "capleaf/viXTTS"
use_deepspeed = False

os.makedirs(checkpoint_dir, exist_ok=True)

required_files = ["model.pth", "config.json", "vocab.json", "speakers_xtts.pth"]
files_in_dir = os.listdir(checkpoint_dir)
missing_files = [name for name in required_files if name not in files_in_dir]
if missing_files:
    snapshot_download(
        repo_id=repo_id,
        repo_type="model",
        local_dir=checkpoint_dir,
    )
    # speakers_xtts.pth is fetched separately from the coqui/XTTS-v2 repo.
    hf_hub_download(
        repo_id="coqui/XTTS-v2",
        filename="speakers_xtts.pth",
        local_dir=checkpoint_dir,
    )

# Build the model from the downloaded config and load its weights.
xtts_config = os.path.join(checkpoint_dir, "config.json")
config = XttsConfig()
config.load_json(xtts_config)
MODEL = Xtts.init_from_config(config)
MODEL.load_checkpoint(
    config, checkpoint_dir=checkpoint_dir, use_deepspeed=use_deepspeed
)

if torch.cuda.is_available():
    MODEL.cuda()

# NOTE: supported_languages is the same list object as config.languages, so
# appending "vi" here also extends the config's own language list.
supported_languages = config.languages
if "vi" not in supported_languages:
    supported_languages.append("vi")
 
55
 
# Literal clean-ups applied, in this order, to the TTSnorm output.
_PUNCTUATION_FIXES = (
    ("..", "."),
    ("!.", "!"),
    ("?.", "?"),
    (" .", "."),
    (" ,", ","),
    ('"', ""),
    ("'", ""),
)

# Matches the standalone tokens "AI" and "A.I". The word boundaries keep
# longer uppercase words that merely contain "AI" (e.g. "HAI", "TAI") intact;
# a plain substring replace would rewrite the middle of those words.
_AI_TOKEN = re.compile(r"\bA\.?I\b")


def normalize_vietnamese_text(text):
    """Normalize Vietnamese text before it is passed to the TTS model.

    The text is first run through ``TTSnorm`` (from ``vinorm``), then a fixed
    list of punctuation clean-ups is applied, quote characters are removed,
    and the standalone abbreviation "AI" / "A.I" is rewritten as "Ây Ai".

    Args:
        text: The raw input text.

    Returns:
        The normalized text.
    """
    normalized = TTSnorm(text, unknown=False, lower=False, rule=True)
    for old, new in _PUNCTUATION_FIXES:
        normalized = normalized.replace(old, new)
    return _AI_TOKEN.sub("Ây Ai", normalized)
70
 
# Accepted input length, counted in characters after stripping whitespace.
_MIN_TEXT_CHARS = 2
_MAX_TEXT_CHARS = 250
# Sample rate handed to torchaudio.save for the generated waveform.
_OUTPUT_SAMPLE_RATE = 24000


async def text_to_speech(text, lang, audio_file_path):
    """Synthesize ``text`` with the XTTS model and save it as an audio file.

    Args:
        text: Text to speak. ``None`` is treated like an empty string and
            surrounding whitespace is ignored.
        lang: Language code; must be one of ``supported_languages``.
        audio_file_path: Destination path for the generated audio.

    Returns:
        A ``(path, error)`` tuple: ``(audio_file_path, None)`` on success,
        or ``(None, message)`` when validation or synthesis fails.
    """
    if lang not in supported_languages:
        return None, "Language not supported."

    # A missing value must not crash len(), and whitespace-only input should
    # not count towards the minimum length.
    text = (text or "").strip()

    if len(text) < _MIN_TEXT_CHARS:
        return None, "Please provide a longer text."

    if len(text) > _MAX_TEXT_CHARS:
        return None, f"Text is too long, please keep it under {_MAX_TEXT_CHARS} characters."

    if lang == "vi":
        text = normalize_vietnamese_text(text)

    try:
        print("Generating new audio...")
        # NOTE(review): both conditioning inputs are passed as None. XTTS
        # inference presumably needs real speaker latents (from a reference
        # clip or a speaker stored in speakers_xtts.pth) - confirm, otherwise
        # every request ends in the except branch below.
        out = MODEL.inference(
            text,
            lang,
            gpt_cond_latent=None,
            speaker_embedding=None,
            repetition_penalty=5.0,
            temperature=0.75,
            enable_text_splitting=True,
        )
        # Add a leading dimension before saving (torchaudio.save presumably
        # expects channels first - TODO confirm).
        waveform = torch.tensor(out["wav"]).unsqueeze(0)
        torchaudio.save(audio_file_path, waveform, _OUTPUT_SAMPLE_RATE)
        return audio_file_path, None
    except Exception as e:
        # Any synthesis failure is reported to the caller as a message
        # instead of propagating.
        return None, f"Error during synthesis: {str(e)}"
99
+
# Directory where generated audio files are stored.
AUDIO_DIR = 'audio_files'
os.makedirs(AUDIO_DIR, exist_ok=True)


async def convert_text_to_speech(text, lang):
    """Synthesize ``text`` into a new, uniquely named .wav file in AUDIO_DIR.

    Args:
        text: Text to speak.
        lang: Language code, passed through to ``text_to_speech``.

    Returns:
        Whatever ``text_to_speech`` returns for the chosen output path.
    """
    # A random UUID keeps concurrent requests from writing to the same file,
    # which a timestamp-based name cannot guarantee.
    audio_file_name = f"{uuid.uuid4().hex}.wav"
    audio_file_path = os.path.join(AUDIO_DIR, audio_file_name)
    return await text_to_speech(text, lang, audio_file_path)
108
 
# Build the Gradio interface
def _handle_convert(text, lang):
    """Run the conversion for the "Convert" button.

    ``convert_text_to_speech`` returns ``(path, error)``, while the button
    feeds an Audio and a File component. An error is therefore raised as
    ``gr.Error`` instead of being sent to the File output, and on success the
    same path is returned for both components. This stays a plain function
    that drives the coroutine with ``asyncio.run``, as the original lambda did.
    """
    audio_path, error = asyncio.run(convert_text_to_speech(text, lang))
    if error:
        raise gr.Error(error)
    return audio_path, audio_path


with gr.Blocks() as iface:
    with gr.Tab("Text to Speech"):
        gr.Markdown("### Convert text to speech")
        text_input = gr.Textbox(lines=10, label="Enter your text here:")
        lang_input = gr.Dropdown(choices=supported_languages, label="Select language:")

        audio_output, file_output = gr.Audio(label="Audio"), gr.File(label="Audio File")
        gr.Button("Convert").click(
            fn=_handle_convert,
            inputs=[text_input, lang_input],
            outputs=[audio_output, file_output],
        )

# Enable the request queue through Blocks.queue() rather than the
# enable_queue keyword of launch().
iface.queue()
iface.launch()