Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,100 +1,106 @@
-import
 import spaces
-
 import gradio as gr
-from
 
-def
-    """
-#
-#
-        'prithivMLmods/Llama-3B-Mono-Cooper', torch_dtype=torch.bfloat16
-    ).cuda()
-        **input_ids,
-        max_new_tokens=max_new_tokens,
-        do_sample=True,
-        temperature=temperature,
-        top_p=top_p,
-        repetition_penalty=1.1,
-        num_return_sequences=1,
-        eos_token_id=128258,
-    )
-    y_np = y_tensor.detach().cpu().numpy()
-    return (24000, y_np)
-#
-    tokens_slider = gr.Slider(minimum=100, maximum=2000, step=50, value=1200, label="Max New Tokens")
-    output_audio = gr.Audio(type="numpy", label="Generated Audio")
-    generate_button = gr.Button("Generate Audio")
-    generate_button.click(
-        fn=generate_audio,
-        inputs=[text_input, temp_slider, top_p_slider, tokens_slider],
-        outputs=output_audio
-    )
-demo.launch()
+import argparse
 import spaces
+import torch
 import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer
 
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", type=str, default="prithivMLmods/Pocket-Llama-3.2-3B-Instruct")
+    parser.add_argument("--max_length", type=int, default=512)
+    parser.add_argument("--do_sample", action="store_true")
+    # This allows ignoring unrecognized arguments, e.g., from Jupyter
+    return parser.parse_known_args()
 
+def load_model(model_name):
+    """Load model and tokenizer from Hugging Face."""
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        torch_dtype=torch.bfloat16,
+        device_map="auto"
+    )
+    return model, tokenizer
+
+# On ZeroGPU Spaces, @spaces.GPU goes on the function that performs inference
+@spaces.GPU
+def generate_reply(model, tokenizer, prompt, max_length, do_sample):
+    """Generate text from the model given a prompt."""
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    # We're returning just the final string; no streaming here
+    output_tokens = model.generate(
+        **inputs,
+        max_length=max_length,
+        do_sample=do_sample
+    )
+    return tokenizer.decode(output_tokens[0], skip_special_tokens=True)
+
+def main():
+    args, _ = get_args()
+    model, tokenizer = load_model(args.model)
+
+    def respond(user_message, chat_history):
+        """
+        Gradio expects a function that takes the last user message and the
+        conversation history, then returns the updated history.
+
+        chat_history is a list of (user_message, bot_reply) pairs.
+        """
+        # Build a single text prompt from the conversation so far
+        prompt = ""
+        for (old_user_msg, old_bot_msg) in chat_history:
+            prompt += f"User: {old_user_msg}\nBot: {old_bot_msg}\n"
+        # Add the new user query
+        prompt += f"User: {user_message}\nBot:"
+
+        # Generate the response
+        bot_message = generate_reply(
+            model=model,
+            tokenizer=tokenizer,
+            prompt=prompt,
+            max_length=args.max_length,
+            do_sample=args.do_sample
+        )
+
+        # The model output will usually contain the entire prompt again,
+        # so strip that prefix off to keep only the newly generated reply
+        if bot_message.startswith(prompt):
+            bot_message = bot_message[len(prompt):]
+
+        # Append the new user message and bot response to the history
+        chat_history.append((user_message, bot_message))
+        return chat_history, chat_history
+
+    # Define the Gradio interface
+    with gr.Blocks() as demo:
+        gr.Markdown("<h2 style='text-align: center;'>Chat with Your Model</h2>")
+
+        # A Chatbot component that will display the conversation
+        chatbot = gr.Chatbot(label="Chat")
+
+        # A text box for user input
+        user_input = gr.Textbox(
+            show_label=False,
+            placeholder="Type your message here and press Enter"
+        )
+
+        # A button to clear the conversation
+        clear_button = gr.Button("Clear")
+
+        # When the user hits Enter in the textbox, call 'respond'
+        # - Inputs: [user_input, chatbot] (the last user message and history)
+        # - Outputs: [chatbot, chatbot] (updates the chatbot display and history)
+        user_input.submit(respond, [user_input, chatbot], [chatbot, chatbot])
+
+        # Define a helper function for clearing
+        def clear_conversation():
+            return [], []
+
+        # When "Clear" is clicked, reset the conversation
+        clear_button.click(fn=clear_conversation, outputs=[chatbot, chatbot])
+
+    # Launch the Gradio app
+    demo.launch()
+
+if __name__ == "__main__":
+    main()
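
For quick reference, here is a minimal standalone sketch (not part of the commit) of the prompt string that respond() assembles from the tuple-based history; the conversation turns below are invented for illustration:

# Standalone illustration of the prompt format built inside respond().
# The chat_history and user_message values are made up for the example.
chat_history = [("Hi there", "Hello! How can I help?")]
user_message = "Tell me a joke"

prompt = ""
for (old_user_msg, old_bot_msg) in chat_history:
    prompt += f"User: {old_user_msg}\nBot: {old_bot_msg}\n"
prompt += f"User: {user_message}\nBot:"

print(prompt)
# User: Hi there
# Bot: Hello! How can I help?
# User: Tell me a joke
# Bot:

Because generate_reply() decodes the whole output sequence, the generated text normally begins with this same prompt, which is why respond() strips the prefix before appending the reply to the history. The argparse interface also makes the script easy to exercise locally, e.g. python app.py --do_sample --max_length 256, using the flags defined in get_args().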