Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
92a440c
1
Parent(s):
dbf9701
Try to make text-to-speech work on ZeroGPU
Browse files
app.py
CHANGED
@@ -20,10 +20,24 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
20 |
vq_model = RQBottleneckTransformer.load_model(
|
21 |
"whisper-vq-stoks-medium-en+pl-fixed.model"
|
22 |
).to(device)
|
23 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
@spaces.GPU
|
26 |
def audio_to_sound_tokens_whisperspeech(audio_path):
|
|
|
27 |
wav, sr = torchaudio.load(audio_path)
|
28 |
if sr != 16000:
|
29 |
wav = torchaudio.functional.resample(wav, sr, 16000)
|
@@ -36,6 +50,7 @@ def audio_to_sound_tokens_whisperspeech(audio_path):
|
|
36 |
|
37 |
@spaces.GPU
|
38 |
def audio_to_sound_tokens_whisperspeech_transcribe(audio_path):
|
|
|
39 |
wav, sr = torchaudio.load(audio_path)
|
40 |
if sr != 16000:
|
41 |
wav = torchaudio.functional.resample(wav, sr, 16000)
|
@@ -45,21 +60,6 @@ def audio_to_sound_tokens_whisperspeech_transcribe(audio_path):
|
|
45 |
|
46 |
result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
|
47 |
return f'<|reserved_special_token_69|><|sound_start|>{result}<|sound_end|>'
|
48 |
-
|
49 |
-
tts = TTSProcessor(device)
|
50 |
-
use_8bit = False
|
51 |
-
llm_path = "homebrewltd/Llama3.1-s-instruct-2024-08-19-epoch-3"
|
52 |
-
tokenizer = AutoTokenizer.from_pretrained(llm_path)
|
53 |
-
model_kwargs = {}
|
54 |
-
if use_8bit:
|
55 |
-
model_kwargs["quantization_config"] = BitsAndBytesConfig(
|
56 |
-
load_in_8bit=True,
|
57 |
-
llm_int8_enable_fp32_cpu_offload=False,
|
58 |
-
llm_int8_has_fp16_weight=False,
|
59 |
-
)
|
60 |
-
else:
|
61 |
-
model_kwargs["torch_dtype"] = torch.bfloat16
|
62 |
-
model = AutoModelForCausalLM.from_pretrained(llm_path, **model_kwargs).to(device)
|
63 |
# print(tokenizer.encode("<|sound_0001|>", add_special_tokens=False))# return the audio tensor
|
64 |
# print(tokenizer.eos_token)
|
65 |
|
@@ -74,6 +74,7 @@ def text_to_audio_file(text):
|
|
74 |
# remove the last character if it is a period
|
75 |
if text_split[-1] == ".":
|
76 |
text_split = text_split[:-1]
|
|
|
77 |
tts.convert_text_to_audio_file(text, temp_file)
|
78 |
# logging.info(f"Saving audio to {temp_file}")
|
79 |
# torchaudio.save(temp_file, audio.cpu(), sample_rate=24000)
|
@@ -165,34 +166,6 @@ for file in os.listdir("./bad_examples"):
|
|
165 |
examples = []
|
166 |
examples.extend(good_examples)
|
167 |
examples.extend(bad_examples)
|
168 |
-
# with gr.Blocks() as iface:
|
169 |
-
# gr.Markdown("# Llama3-S: A Speech & Text Fusion Model Checkpoint from Homebrew")
|
170 |
-
# gr.Markdown("Enter text or upload a .wav file to generate text based on its content.")
|
171 |
-
|
172 |
-
# with gr.Row():
|
173 |
-
# input_type = gr.Radio(["text", "audio"], label="Input Type", value="audio")
|
174 |
-
# text_input = gr.Textbox(label="Text Input", visible=False)
|
175 |
-
# audio_input = gr.Audio(sources=["upload"], type="filepath", label="Upload audio", visible=True)
|
176 |
-
|
177 |
-
# output = gr.Textbox(label="Generated Text")
|
178 |
-
|
179 |
-
# submit_button = gr.Button("Submit")
|
180 |
-
|
181 |
-
# input_type.change(
|
182 |
-
# update_visibility,
|
183 |
-
# inputs=[input_type],
|
184 |
-
# outputs=[text_input, audio_input]
|
185 |
-
# )
|
186 |
-
|
187 |
-
# submit_button.click(
|
188 |
-
# process_input,
|
189 |
-
# inputs=[input_type, text_input, audio_input],
|
190 |
-
# outputs=[output]
|
191 |
-
# )
|
192 |
-
|
193 |
-
# gr.Examples(examples, inputs=[audio_input])
|
194 |
-
|
195 |
-
# iface.launch(server_name="127.0.0.1", server_port=8080)
|
196 |
with gr.Blocks() as iface:
|
197 |
gr.Markdown("# Llama3-1-S: checkpoint Aug 19, 2024")
|
198 |
gr.Markdown("Enter text to convert to audio, then submit the audio to generate text or Upload Audio")
|
|
|
20 |
vq_model = RQBottleneckTransformer.load_model(
|
21 |
"whisper-vq-stoks-medium-en+pl-fixed.model"
|
22 |
).to(device)
|
23 |
+
# tts = TTSProcessor('cpu')
|
24 |
+
use_8bit = False
|
25 |
+
llm_path = "homebrewltd/Llama3.1-s-instruct-2024-08-19-epoch-3"
|
26 |
+
tokenizer = AutoTokenizer.from_pretrained(llm_path)
|
27 |
+
model_kwargs = {}
|
28 |
+
if use_8bit:
|
29 |
+
model_kwargs["quantization_config"] = BitsAndBytesConfig(
|
30 |
+
load_in_8bit=True,
|
31 |
+
llm_int8_enable_fp32_cpu_offload=False,
|
32 |
+
llm_int8_has_fp16_weight=False,
|
33 |
+
)
|
34 |
+
else:
|
35 |
+
model_kwargs["torch_dtype"] = torch.bfloat16
|
36 |
+
model = AutoModelForCausalLM.from_pretrained(llm_path, **model_kwargs).to(device)
|
37 |
|
38 |
@spaces.GPU
|
39 |
def audio_to_sound_tokens_whisperspeech(audio_path):
|
40 |
+
vq_model.ensure_whisper('cuda')
|
41 |
wav, sr = torchaudio.load(audio_path)
|
42 |
if sr != 16000:
|
43 |
wav = torchaudio.functional.resample(wav, sr, 16000)
|
|
|
50 |
|
51 |
@spaces.GPU
|
52 |
def audio_to_sound_tokens_whisperspeech_transcribe(audio_path):
|
53 |
+
vq_model.ensure_whisper('cuda')
|
54 |
wav, sr = torchaudio.load(audio_path)
|
55 |
if sr != 16000:
|
56 |
wav = torchaudio.functional.resample(wav, sr, 16000)
|
|
|
60 |
|
61 |
result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
|
62 |
return f'<|reserved_special_token_69|><|sound_start|>{result}<|sound_end|>'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
# print(tokenizer.encode("<|sound_0001|>", add_special_tokens=False))# return the audio tensor
|
64 |
# print(tokenizer.eos_token)
|
65 |
|
|
|
74 |
# remove the last character if it is a period
|
75 |
if text_split[-1] == ".":
|
76 |
text_split = text_split[:-1]
|
77 |
+
tts = TTSProcessor("cuda")
|
78 |
tts.convert_text_to_audio_file(text, temp_file)
|
79 |
# logging.info(f"Saving audio to {temp_file}")
|
80 |
# torchaudio.save(temp_file, audio.cpu(), sample_rate=24000)
|
|
|
166 |
examples = []
|
167 |
examples.extend(good_examples)
|
168 |
examples.extend(bad_examples)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
169 |
with gr.Blocks() as iface:
|
170 |
gr.Markdown("# Llama3-1-S: checkpoint Aug 19, 2024")
|
171 |
gr.Markdown("Enter text to convert to audio, then submit the audio to generate text or Upload Audio")
|