Braszczynski committed
Commit a9938e0
1 Parent(s): cec8b5d

Update app.py

Files changed (1)
  1. app.py +39 -14
app.py CHANGED
@@ -1,19 +1,32 @@
 import gradio as gr
 import torch
-from transformers import AutoTokenizer
-from adapters import AutoAdapterModel
+from unsloth import FastLanguageModel
+from transformers import TextStreamer

-model_name = "unsloth/Meta-Llama-3.1-8B-Instruct"
+# Configuration Variables
+model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"  # Replace with your actual model name
+lora_adapter = "Braszczynski/Llama-3.2-3B-Instruct-bnb-4bit-460steps"

-# Load tokenizer with 4-bit quantization
-tokenizer = AutoTokenizer.from_pretrained(model_name, load_in_4bit=True)
+max_seq_length = 512  # Adjust as needed
+dtype = None  # Example dtype, adjust based on your setup
+load_in_4bit = True  # Set to True if you want to use 4-bit quantization

-# Load the base model with adapters, ensuring it's loaded in 4-bit
-model = AutoAdapterModel.from_pretrained(model_name, load_in_4bit=True)
+# Load the model and tokenizer using FastLanguageModel
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name=model_name,
+    max_seq_length=max_seq_length,
+    dtype=dtype,
+    load_in_4bit=load_in_4bit,
+)

 # Load the adapter
-model.load_adapter("Braszczynski/Llama-3.2-3B-Instruct-bnb-4bit-460steps")
+model.load_adapter(lora_adapter)
+
+# Enable native 2x faster inference
+FastLanguageModel.for_inference(model)

+# Optional: Initialize TextStreamer if you plan to use streaming
+# text_streamer = TextStreamer(tokenizer, skip_prompt=True)

 def respond(message, history, system_message, max_tokens, temperature, top_p):
     # Combine system message and chat history
@@ -22,16 +35,28 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
     chat_history += f"User: {user_msg}\nAssistant: {bot_reply}\n"
     chat_history += f"User: {message}\nAssistant:"

-    # Tokenize the input
-    inputs = tokenizer(chat_history, return_tensors="pt", truncation=True).to("cuda")
+    # Apply chat template and tokenize the input
+    inputs = tokenizer.apply_chat_template(
+        [{"role": "user", "content": message}] if not history else [
+            {"role": "system", "content": system_message}] + [
+            {"role": "user", "content": msg} for msg, _ in history
+        ] + [{"role": "assistant", "content": reply} for _, reply in history] + [
+            {"role": "user", "content": message}
+        ],
+        tokenize=True,
+        add_generation_prompt=True,  # Must add for generation
+        return_tensors="pt",
+    ).to("cuda")

-    # Generate response with reduced max tokens if necessary
+    # Generate response
     outputs = model.generate(
-        inputs["input_ids"],
-        max_new_tokens=max_tokens,  # Consider setting a lower default
+        input_ids=inputs["input_ids"],
+        max_new_tokens=max_tokens,
         temperature=temperature,
         top_p=top_p,
-        pad_token_id=tokenizer.eos_token_id
+        pad_token_id=tokenizer.eos_token_id,
+        use_cache=True
+        # streamer=text_streamer  # Uncomment if using streaming
     )

     # Decode and format the output
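
Note on the new tokenization block (a sketch, not part of the commit): in recent transformers releases, tokenizer.apply_chat_template(..., tokenize=True, return_tensors="pt") returns a bare tensor of token ids unless return_dict=True is also passed, so the inputs["input_ids"] lookup above can fail at runtime; the committed message list also appends all user turns before all assistant turns instead of interleaving them. A minimal alternative respond() body, assuming the model and tokenizer objects created above (build_messages is an illustrative helper, not from the commit):

# Sketch only: alternative respond() body, assuming the model/tokenizer above
# and a transformers version that supports return_dict=True in apply_chat_template.
import torch


def build_messages(message, history, system_message):
    # Interleave past (user, assistant) turns so the chat template sees the
    # conversation in order, then append the new user message.
    messages = [{"role": "system", "content": system_message}] if system_message else []
    for user_msg, bot_reply in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_reply})
    messages.append({"role": "user", "content": message})
    return messages


def respond(message, history, system_message, max_tokens, temperature, top_p):
    messages = build_messages(message, history, system_message)

    # return_dict=True yields {"input_ids", "attention_mask"}; without it,
    # apply_chat_template returns a bare tensor of token ids.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # append the assistant header for generation
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tokenizer.eos_token_id,
            use_cache=True,
        )

    # Decode only the newly generated tokens, not the prompt.
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()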
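
The Gradio wiring is outside this diff, but the respond(message, history, system_message, max_tokens, temperature, top_p) signature matches gr.ChatInterface with additional_inputs, so the unchanged tail of app.py presumably looks roughly like the sketch below (labels and default values are placeholders, not taken from the commit):

# Sketch only: typical gr.ChatInterface wiring for a respond() function with
# extra controls; the actual launch code is not shown in this diff.
import gradio as gr

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    ],
)

if __name__ == "__main__":
    demo.launch()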