bachvudinh committed
Commit c3d86d3
1 Parent(s): 87736a3
Files changed (2):
  1. app copy.py +0 -254
  2. app.py +251 -4
app copy.py DELETED
@@ -1,254 +0,0 @@
- import gradio as gr
- import torch
- import torchaudio
- from encodec import EncodecModel
- from whisperspeech.vq_stoks import RQBottleneckTransformer
- from encodec.utils import convert_audio
- from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
- from transformers import StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
- from threading import Thread
- import logging
- import os
- from generate_audio import (
-     TTSProcessor,
- )
- import uuid
-
- device = "cuda" if torch.cuda.is_available() else "cpu"
- vq_model = RQBottleneckTransformer.load_model(
-     "whisper-vq-stoks-medium-en+pl-fixed.model"
- ).to(device)
- vq_model.ensure_whisper(device)
-
- def audio_to_sound_tokens_whisperspeech(audio_path):
-     wav, sr = torchaudio.load(audio_path)
-     if sr != 16000:
-         wav = torchaudio.functional.resample(wav, sr, 16000)
-     with torch.no_grad():
-         codes = vq_model.encode_audio(wav.to(device))
-         codes = codes[0].cpu().tolist()
-
-     result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
-     return f'<|sound_start|>{result}<|sound_end|>'
- def audio_to_sound_tokens_whisperspeech_transcribe(audio_path):
-     wav, sr = torchaudio.load(audio_path)
-     if sr != 16000:
-         wav = torchaudio.functional.resample(wav, sr, 16000)
-     with torch.no_grad():
-         codes = vq_model.encode_audio(wav.to(device))
-         codes = codes[0].cpu().tolist()
-
-     result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
-     return f'<|reserved_special_token_69|><|sound_start|>{result}<|sound_end|>'
- def audio_to_sound_tokens(audio_path, target_bandwidth=1.5, device="cuda"):
-     model = EncodecModel.encodec_model_24khz()
-     model.set_target_bandwidth(target_bandwidth)
-     model.to(device)
-
-     wav, sr = torchaudio.load(audio_path)
-     wav = convert_audio(wav, sr, model.sample_rate, model.channels)
-     wav = wav.unsqueeze(0).to(device)
-
-     with torch.no_grad():
-         encoded_frames = model.encode(wav)
-         codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1)
-
-     audio_code1, audio_code2 = codes[0][0], codes[0][1]
-     flatten_tokens = torch.stack((audio_code1, audio_code2), dim=1).flatten().tolist()
-     result = ''.join(f'<|sound_{num:04d}|>' for num in flatten_tokens)
-     return f'<|sound_start|>{result}<|sound_end|>'
-
- def setup_pipeline(model_path, use_4bit=False, use_8bit=False):
-     tokenizer = AutoTokenizer.from_pretrained(model_path)
-     model_kwargs = {"device_map": "auto"}
-     if use_8bit:
-         model_kwargs["quantization_config"] = BitsAndBytesConfig(
-             load_in_8bit=True,
-             llm_int8_enable_fp32_cpu_offload=False,
-             llm_int8_has_fp16_weight=False,
-         )
-     else:
-         model_kwargs["torch_dtype"] = torch.bfloat16
-     model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs)
-     return pipeline("text-generation", model=model, tokenizer=tokenizer)
-
- tts = TTSProcessor(device)
- llm_path = "homebrewltd/Llama3.1-s-instruct-2024-08-19-epoch-3"
- pipe = setup_pipeline(llm_path, use_8bit=False)
- tokenizer = pipe.tokenizer
- model = pipe.model
- # print(tokenizer.encode("<|sound_0001|>", add_special_tokens=False))  # return the audio tensor
- # print(tokenizer.eos_token)
- def text_to_audio_file(text):
-     # gen a random id for the audio file
-     id = str(uuid.uuid4())
-     temp_file = f"./user_audio/{id}_temp_audio.wav"
-     text = text
-     text_split = "_".join(text.lower().split(" "))
-     # remove the last character if it is a period
-     if text_split[-1] == ".":
-         text_split = text_split[:-1]
-     tts.convert_text_to_audio_file(text, temp_file)
-     # logging.info(f"Saving audio to {temp_file}")
-     # torchaudio.save(temp_file, audio.cpu(), sample_rate=24000)
-     print(f"Saved audio to {temp_file}")
-     return temp_file
- def process_input(input_type, text_input=None, audio_file=None):
-     # if input_type == "text":
-     #     audio_file = "temp_audio.wav"
-
-     for partial_message in process_audio(audio_file):
-         yield partial_message
-
-     # if input_type == "text":
-     #     os.remove(audio_file)
- def process_transcribe_input(input_type, text_input=None, audio_file=None):
-     # if input_type == "text":
-     #     audio_file = "temp_audio.wav"
-
-     for partial_message in process_audio(audio_file, transcript=True):
-         yield partial_message
-
-     # if input_type == "text":
-     #     os.remove(audio_file)
- class StopOnTokens(StoppingCriteria):
-     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
-         # encode </s> token
-         stop_ids = [tokenizer.eos_token_id, 128009]  # Adjust this based on your model's tokenizer
-         for stop_id in stop_ids:
-             if input_ids[0][-1] == stop_id:
-                 return True
-         return False
- def process_audio(audio_file, transcript=False):
-     if audio_file is None:
-         raise ValueError("No audio file provided")
-
-     logging.info(f"Audio file received: {audio_file}")
-     logging.info(f"Audio file type: {type(audio_file)}")
-
-     sound_tokens = audio_to_sound_tokens_whisperspeech_transcribe(audio_file) if transcript else audio_to_sound_tokens_whisperspeech(audio_file)
-     logging.info("Sound tokens generated successfully")
-     # logging.info(f"audio_file: {audio_file.name}")
-     messages = [
-         {"role": "user", "content": sound_tokens},
-     ]
-
-     stop = StopOnTokens()
-     input_str = tokenizer.apply_chat_template(messages, tokenize=False)
-     input_ids = tokenizer.encode(input_str, return_tensors="pt")
-     input_ids = input_ids.to(model.device)
-
-     streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
-     generation_kwargs = dict(
-         input_ids=input_ids,
-         streamer=streamer,
-         max_new_tokens=1024,
-         do_sample=False,
-         stopping_criteria=StoppingCriteriaList([stop])
-     )
-
-     thread = Thread(target=model.generate, kwargs=generation_kwargs)
-     thread.start()
-
-     partial_message = ""
-     for new_token in streamer:
-         partial_message += new_token
-         if tokenizer.eos_token in partial_message:
-             break
-         partial_message = partial_message.replace("assistant\n\n", "")
-         yield partial_message
- # def stop_generation():
- #     # This is a placeholder. Implement actual stopping logic here if needed.
- #     return "Generation stopped.", gr.Button.update(interactive=False)
- # take all the examples from the examples folder
- good_examples = []
- for file in os.listdir("./examples"):
-     if file.endswith(".wav"):
-         good_examples.append([f"./examples/{file}"])
- bad_examples = []
- for file in os.listdir("./bad_examples"):
-     if file.endswith(".wav"):
-         bad_examples.append([f"./bad_examples/{file}"])
- examples = []
- examples.extend(good_examples)
- examples.extend(bad_examples)
- # with gr.Blocks() as iface:
- #     gr.Markdown("# Llama3-S: A Speech & Text Fusion Model Checkpoint from Homebrew")
- #     gr.Markdown("Enter text or upload a .wav file to generate text based on its content.")
-
- #     with gr.Row():
- #         input_type = gr.Radio(["text", "audio"], label="Input Type", value="audio")
- #         text_input = gr.Textbox(label="Text Input", visible=False)
- #         audio_input = gr.Audio(sources=["upload"], type="filepath", label="Upload audio", visible=True)
-
- #     output = gr.Textbox(label="Generated Text")
-
- #     submit_button = gr.Button("Submit")
-
- #     input_type.change(
- #         update_visibility,
- #         inputs=[input_type],
- #         outputs=[text_input, audio_input]
- #     )
-
- #     submit_button.click(
- #         process_input,
- #         inputs=[input_type, text_input, audio_input],
- #         outputs=[output]
- #     )
-
- #     gr.Examples(examples, inputs=[audio_input])
-
- # iface.launch(server_name="127.0.0.1", server_port=8080)
- with gr.Blocks() as iface:
-     gr.Markdown("# Llama3-1-S: checkpoint Aug 19, 2024")
-     gr.Markdown("Enter text to convert to audio, then submit the audio to generate text or Upload Audio")
-
-     with gr.Row():
-         input_type = gr.Radio(["text", "audio"], label="Input Type", value="audio")
-         text_input = gr.Textbox(label="Text Input", visible=False)
-         audio_input = gr.Audio(label="Audio", type="filepath", visible=True)
-         # audio_output = gr.Audio(label="Converted Audio", type="filepath", visible=False)
-
-     convert_button = gr.Button("Convert to Audio", visible=False)
-     submit_button = gr.Button("Submit for Processing")
-     transcrip_button = gr.Button("Please Transcribe the audio for me")
-
-     text_output = gr.Textbox(label="Generated Text")
-
-     def update_visibility(input_type):
-         return (gr.update(visible=input_type == "text"),
-                 gr.update(visible=input_type == "text"))
-     def convert_and_display(text):
-         audio_file = text_to_audio_file(text)
-         return audio_file
-     def process_example(file_path):
-         return update_visibility("audio")
-     input_type.change(
-         update_visibility,
-         inputs=[input_type],
-         outputs=[text_input, convert_button]
-     )
-
-     convert_button.click(
-         convert_and_display,
-         inputs=[text_input],
-         outputs=[audio_input]
-     )
-
-     submit_button.click(
-         process_input,
-         inputs=[input_type, text_input, audio_input],
-         outputs=[text_output]
-     )
-     transcrip_button.click(
-         process_transcribe_input,
-         inputs=[input_type, text_input, audio_input],
-         outputs=[text_output]
-     )
-
-     gr.Examples(examples, inputs=[audio_input], outputs=[audio_input], fn=process_example)
- iface.queue()
- iface.launch()
- # launch locally
- # iface.launch(server_name="0.0.0.0")

app.py CHANGED
@@ -1,7 +1,254 @@
  import gradio as gr
-
- def greet(name):
-     return "Hello " + name + "!!"
-
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ import torch
+ import torchaudio
+ from encodec import EncodecModel
+ from whisperspeech.vq_stoks import RQBottleneckTransformer
+ from encodec.utils import convert_audio
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
+ from transformers import StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
+ from threading import Thread
+ import logging
+ import os
+ from generate_audio import (
+     TTSProcessor,
+ )
+ import uuid
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ vq_model = RQBottleneckTransformer.load_model(
+     "whisper-vq-stoks-medium-en+pl-fixed.model"
+ ).to(device)
+ vq_model.ensure_whisper(device)
+
+ def audio_to_sound_tokens_whisperspeech(audio_path):
+     wav, sr = torchaudio.load(audio_path)
+     if sr != 16000:
+         wav = torchaudio.functional.resample(wav, sr, 16000)
+     with torch.no_grad():
+         codes = vq_model.encode_audio(wav.to(device))
+         codes = codes[0].cpu().tolist()
+
+     result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
+     return f'<|sound_start|>{result}<|sound_end|>'
+ def audio_to_sound_tokens_whisperspeech_transcribe(audio_path):
+     wav, sr = torchaudio.load(audio_path)
+     if sr != 16000:
+         wav = torchaudio.functional.resample(wav, sr, 16000)
+     with torch.no_grad():
+         codes = vq_model.encode_audio(wav.to(device))
+         codes = codes[0].cpu().tolist()
+
+     result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
+     return f'<|reserved_special_token_69|><|sound_start|>{result}<|sound_end|>'
+ def audio_to_sound_tokens(audio_path, target_bandwidth=1.5, device="cuda"):
+     model = EncodecModel.encodec_model_24khz()
+     model.set_target_bandwidth(target_bandwidth)
+     model.to(device)
+
+     wav, sr = torchaudio.load(audio_path)
+     wav = convert_audio(wav, sr, model.sample_rate, model.channels)
+     wav = wav.unsqueeze(0).to(device)
+
+     with torch.no_grad():
+         encoded_frames = model.encode(wav)
+         codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1)
+
+     audio_code1, audio_code2 = codes[0][0], codes[0][1]
+     flatten_tokens = torch.stack((audio_code1, audio_code2), dim=1).flatten().tolist()
+     result = ''.join(f'<|sound_{num:04d}|>' for num in flatten_tokens)
+     return f'<|sound_start|>{result}<|sound_end|>'
+
+ def setup_pipeline(model_path, use_4bit=False, use_8bit=False):
+     tokenizer = AutoTokenizer.from_pretrained(model_path)
+     model_kwargs = {"device_map": "auto"}
+     if use_8bit:
+         model_kwargs["quantization_config"] = BitsAndBytesConfig(
+             load_in_8bit=True,
+             llm_int8_enable_fp32_cpu_offload=False,
+             llm_int8_has_fp16_weight=False,
+         )
+     else:
+         model_kwargs["torch_dtype"] = torch.bfloat16
+     model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs)
+     return pipeline("text-generation", model=model, tokenizer=tokenizer)
+
+ tts = TTSProcessor(device)
+ llm_path = "homebrewltd/Llama3.1-s-instruct-2024-08-19-epoch-3"
+ pipe = setup_pipeline(llm_path, use_8bit=False)
+ tokenizer = pipe.tokenizer
+ model = pipe.model
+ # print(tokenizer.encode("<|sound_0001|>", add_special_tokens=False))  # return the audio tensor
+ # print(tokenizer.eos_token)
+ def text_to_audio_file(text):
+     # gen a random id for the audio file
+     id = str(uuid.uuid4())
+     temp_file = f"./user_audio/{id}_temp_audio.wav"
+     text = text
+     text_split = "_".join(text.lower().split(" "))
+     # remove the last character if it is a period
+     if text_split[-1] == ".":
+         text_split = text_split[:-1]
+     tts.convert_text_to_audio_file(text, temp_file)
+     # logging.info(f"Saving audio to {temp_file}")
+     # torchaudio.save(temp_file, audio.cpu(), sample_rate=24000)
+     print(f"Saved audio to {temp_file}")
+     return temp_file
+ def process_input(input_type, text_input=None, audio_file=None):
+     # if input_type == "text":
+     #     audio_file = "temp_audio.wav"
+
+     for partial_message in process_audio(audio_file):
+         yield partial_message
+
+     # if input_type == "text":
+     #     os.remove(audio_file)
+ def process_transcribe_input(input_type, text_input=None, audio_file=None):
+     # if input_type == "text":
+     #     audio_file = "temp_audio.wav"
+
+     for partial_message in process_audio(audio_file, transcript=True):
+         yield partial_message
+
+     # if input_type == "text":
+     #     os.remove(audio_file)
+ class StopOnTokens(StoppingCriteria):
+     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+         # encode </s> token
+         stop_ids = [tokenizer.eos_token_id, 128009]  # Adjust this based on your model's tokenizer
+         for stop_id in stop_ids:
+             if input_ids[0][-1] == stop_id:
+                 return True
+         return False
+ def process_audio(audio_file, transcript=False):
+     if audio_file is None:
+         raise ValueError("No audio file provided")
+
+     logging.info(f"Audio file received: {audio_file}")
+     logging.info(f"Audio file type: {type(audio_file)}")
+
+     sound_tokens = audio_to_sound_tokens_whisperspeech_transcribe(audio_file) if transcript else audio_to_sound_tokens_whisperspeech(audio_file)
+     logging.info("Sound tokens generated successfully")
+     # logging.info(f"audio_file: {audio_file.name}")
+     messages = [
+         {"role": "user", "content": sound_tokens},
+     ]
+
+     stop = StopOnTokens()
+     input_str = tokenizer.apply_chat_template(messages, tokenize=False)
+     input_ids = tokenizer.encode(input_str, return_tensors="pt")
+     input_ids = input_ids.to(model.device)
+
+     streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
+     generation_kwargs = dict(
+         input_ids=input_ids,
+         streamer=streamer,
+         max_new_tokens=1024,
+         do_sample=False,
+         stopping_criteria=StoppingCriteriaList([stop])
+     )
+
+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+
+     partial_message = ""
+     for new_token in streamer:
+         partial_message += new_token
+         if tokenizer.eos_token in partial_message:
+             break
+         partial_message = partial_message.replace("assistant\n\n", "")
+         yield partial_message
+ # def stop_generation():
+ #     # This is a placeholder. Implement actual stopping logic here if needed.
+ #     return "Generation stopped.", gr.Button.update(interactive=False)
+ # take all the examples from the examples folder
+ good_examples = []
+ for file in os.listdir("./examples"):
+     if file.endswith(".wav"):
+         good_examples.append([f"./examples/{file}"])
+ bad_examples = []
+ for file in os.listdir("./bad_examples"):
+     if file.endswith(".wav"):
+         bad_examples.append([f"./bad_examples/{file}"])
+ examples = []
+ examples.extend(good_examples)
+ examples.extend(bad_examples)
+ # with gr.Blocks() as iface:
+ #     gr.Markdown("# Llama3-S: A Speech & Text Fusion Model Checkpoint from Homebrew")
+ #     gr.Markdown("Enter text or upload a .wav file to generate text based on its content.")
+
+ #     with gr.Row():
+ #         input_type = gr.Radio(["text", "audio"], label="Input Type", value="audio")
+ #         text_input = gr.Textbox(label="Text Input", visible=False)
+ #         audio_input = gr.Audio(sources=["upload"], type="filepath", label="Upload audio", visible=True)
+
+ #     output = gr.Textbox(label="Generated Text")
+
+ #     submit_button = gr.Button("Submit")
+
+ #     input_type.change(
+ #         update_visibility,
+ #         inputs=[input_type],
+ #         outputs=[text_input, audio_input]
+ #     )
+
+ #     submit_button.click(
+ #         process_input,
+ #         inputs=[input_type, text_input, audio_input],
+ #         outputs=[output]
+ #     )
+
+ #     gr.Examples(examples, inputs=[audio_input])
+
+ # iface.launch(server_name="127.0.0.1", server_port=8080)
+ with gr.Blocks() as iface:
+     gr.Markdown("# Llama3-1-S: checkpoint Aug 19, 2024")
+     gr.Markdown("Enter text to convert to audio, then submit the audio to generate text or Upload Audio")
+
+     with gr.Row():
+         input_type = gr.Radio(["text", "audio"], label="Input Type", value="audio")
+         text_input = gr.Textbox(label="Text Input", visible=False)
+         audio_input = gr.Audio(label="Audio", type="filepath", visible=True)
+         # audio_output = gr.Audio(label="Converted Audio", type="filepath", visible=False)
+
+     convert_button = gr.Button("Convert to Audio", visible=False)
+     submit_button = gr.Button("Submit for Processing")
+     transcrip_button = gr.Button("Please Transcribe the audio for me")
+
+     text_output = gr.Textbox(label="Generated Text")
+
+     def update_visibility(input_type):
+         return (gr.update(visible=input_type == "text"),
+                 gr.update(visible=input_type == "text"))
+     def convert_and_display(text):
+         audio_file = text_to_audio_file(text)
+         return audio_file
+     def process_example(file_path):
+         return update_visibility("audio")
+     input_type.change(
+         update_visibility,
+         inputs=[input_type],
+         outputs=[text_input, convert_button]
+     )
+
+     convert_button.click(
+         convert_and_display,
+         inputs=[text_input],
+         outputs=[audio_input]
+     )
+
+     submit_button.click(
+         process_input,
+         inputs=[input_type, text_input, audio_input],
+         outputs=[text_output]
+     )
+     transcrip_button.click(
+         process_transcribe_input,
+         inputs=[input_type, text_input, audio_input],
+         outputs=[text_output]
+     )
+
+     gr.Examples(examples, inputs=[audio_input], outputs=[audio_input], fn=process_example)
+ iface.queue()
+ iface.launch()
+ # launch locally
+ # iface.launch(server_name="0.0.0.0")