Spaces:

jan-hq
/

Llama3.1-s-v0.2

Running on Zero

App Files Files Community

bachvudinh committed on Aug 22, 2024

Commit

92a440c

1 Parent(s): dbf9701

try to make text to speech work on zero GPU

Browse files

Files changed (1) hide show

app.py +17 -44

app.py CHANGED Viewed

@@ -20,10 +20,24 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 vq_model = RQBottleneckTransformer.load_model(
         "whisper-vq-stoks-medium-en+pl-fixed.model"
     ).to(device)
-# vq_model.ensure_whisper(device)
 @spaces.GPU
 def audio_to_sound_tokens_whisperspeech(audio_path):
     wav, sr = torchaudio.load(audio_path)
     if sr != 16000:
         wav = torchaudio.functional.resample(wav, sr, 16000)
@@ -36,6 +50,7 @@ def audio_to_sound_tokens_whisperspeech(audio_path):
 @spaces.GPU
 def audio_to_sound_tokens_whisperspeech_transcribe(audio_path):
     wav, sr = torchaudio.load(audio_path)
     if sr != 16000:
         wav = torchaudio.functional.resample(wav, sr, 16000)
@@ -45,21 +60,6 @@ def audio_to_sound_tokens_whisperspeech_transcribe(audio_path):
     result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
     return f'<|reserved_special_token_69|><|sound_start|>{result}<|sound_end|>'
-tts = TTSProcessor(device)
-use_8bit = False
-llm_path = "homebrewltd/Llama3.1-s-instruct-2024-08-19-epoch-3"
-tokenizer = AutoTokenizer.from_pretrained(llm_path)
-model_kwargs = {}
-if use_8bit:
-    model_kwargs["quantization_config"] = BitsAndBytesConfig(
-        load_in_8bit=True,
-        llm_int8_enable_fp32_cpu_offload=False,
-        llm_int8_has_fp16_weight=False,
-    )
-else:
-    model_kwargs["torch_dtype"] = torch.bfloat16
-model = AutoModelForCausalLM.from_pretrained(llm_path, **model_kwargs).to(device)
 # print(tokenizer.encode("<|sound_0001|>", add_special_tokens=False))# return the audio tensor
 # print(tokenizer.eos_token)
@@ -74,6 +74,7 @@ def text_to_audio_file(text):
     # remove the last character if it is a period
     if text_split[-1] == ".":
         text_split = text_split[:-1]
     tts.convert_text_to_audio_file(text, temp_file)
     # logging.info(f"Saving audio to {temp_file}")
     # torchaudio.save(temp_file, audio.cpu(), sample_rate=24000)
@@ -165,34 +166,6 @@ for file in os.listdir("./bad_examples"):
 examples = []
 examples.extend(good_examples)
 examples.extend(bad_examples)
-# with gr.Blocks() as iface:
-#     gr.Markdown("# Llama3-S: A Speech & Text Fusion Model Checkpoint from Homebrew")
-#     gr.Markdown("Enter text or upload a .wav file to generate text based on its content.")
-#     with gr.Row():
-#         input_type = gr.Radio(["text", "audio"], label="Input Type", value="audio")
-#         text_input = gr.Textbox(label="Text Input", visible=False)
-#         audio_input = gr.Audio(sources=["upload"], type="filepath", label="Upload audio", visible=True)
-#     output = gr.Textbox(label="Generated Text")
-#     submit_button = gr.Button("Submit")
-#     input_type.change(
-#         update_visibility,
-#         inputs=[input_type],
-#         outputs=[text_input, audio_input]
-#     )
-#     submit_button.click(
-#         process_input,
-#         inputs=[input_type, text_input, audio_input],
-#         outputs=[output]
-#     )
-#     gr.Examples(examples, inputs=[audio_input])
-# iface.launch(server_name="127.0.0.1", server_port=8080)
 with gr.Blocks() as iface:
     gr.Markdown("# Llama3-1-S: checkpoint Aug 19, 2024")
     gr.Markdown("Enter text to convert to audio, then submit the audio to generate text or Upload Audio")

 vq_model = RQBottleneckTransformer.load_model(
         "whisper-vq-stoks-medium-en+pl-fixed.model"
     ).to(device)
+# tts = TTSProcessor('cpu')
+use_8bit = False
+llm_path = "homebrewltd/Llama3.1-s-instruct-2024-08-19-epoch-3"
+tokenizer = AutoTokenizer.from_pretrained(llm_path)
+model_kwargs = {}
+if use_8bit:
+    model_kwargs["quantization_config"] = BitsAndBytesConfig(
+        load_in_8bit=True,
+        llm_int8_enable_fp32_cpu_offload=False,
+        llm_int8_has_fp16_weight=False,
+    )
+else:
+    model_kwargs["torch_dtype"] = torch.bfloat16
+model = AutoModelForCausalLM.from_pretrained(llm_path, **model_kwargs).to(device)
 @spaces.GPU
 def audio_to_sound_tokens_whisperspeech(audio_path):
+    vq_model.ensure_whisper('cuda')
     wav, sr = torchaudio.load(audio_path)
     if sr != 16000:
         wav = torchaudio.functional.resample(wav, sr, 16000)
 @spaces.GPU
 def audio_to_sound_tokens_whisperspeech_transcribe(audio_path):
+    vq_model.ensure_whisper('cuda')
     wav, sr = torchaudio.load(audio_path)
     if sr != 16000:
         wav = torchaudio.functional.resample(wav, sr, 16000)
     result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
     return f'<|reserved_special_token_69|><|sound_start|>{result}<|sound_end|>'
 # print(tokenizer.encode("<|sound_0001|>", add_special_tokens=False))# return the audio tensor
 # print(tokenizer.eos_token)
     # remove the last character if it is a period
     if text_split[-1] == ".":
         text_split = text_split[:-1]
+    tts = TTSProcessor("cuda")
     tts.convert_text_to_audio_file(text, temp_file)
     # logging.info(f"Saving audio to {temp_file}")
     # torchaudio.save(temp_file, audio.cpu(), sample_rate=24000)
 examples = []
 examples.extend(good_examples)
 examples.extend(bad_examples)
 with gr.Blocks() as iface:
     gr.Markdown("# Llama3-1-S: checkpoint Aug 19, 2024")
     gr.Markdown("Enter text to convert to audio, then submit the audio to generate text or Upload Audio")