daresearch committed
Commit d007172 · verified · 1 parent: f9c104e

Update app.py

Files changed (1):
  app.py (+19 −9)
app.py CHANGED
@@ -1,5 +1,5 @@
  import torch
- from transformers import AutoTokenizer
+ from transformers import AutoTokenizer, AutoConfig
  from unsloth import SlothModel  # For quantized base model
  from peft import PeftModel
  import gradio as gr
@@ -8,12 +8,21 @@ import gradio as gr
  base_model_name = "unsloth/Llama-3.3-70B-Instruct"  # Replace with the actual Unsloth-supported base model
  adapter_repo = "daresearch/Llama-3.3-70B-ft-exec-roles"
 
- # Step 2: Load the 4-bit quantized base model using Unsloth
+ # Step 2: Load the 4-bit quantized base model using Unsloth with RoPE adjustment
+ # Check and align RoPE scaling for extended context (if needed)
+ config = AutoConfig.from_pretrained(base_model_name)
+ config.rope_scaling = {
+     "type": "linear",  # Use "linear" or "dynamic" scaling
+     "factor": 8.0,     # Adjust factor based on adapter's context length (e.g., 4096 * 8 = 32k tokens)
+ }
+
+ # Load the quantized base model
  base_model = SlothModel.from_pretrained(
      base_model_name,
-     load_in_4bit=True,           # Enable 4-bit quantization
-     device_map="auto",           # Automatically distribute across devices
-     torch_dtype=torch.float16,   # Use FP16 for efficiency
+     load_in_4bit=True,           # Enable 4-bit quantization
+     device_map="auto",           # Automatically distribute across devices
+     torch_dtype=torch.float16,   # Use FP16 for efficiency
+     config=config,               # Pass the updated configuration
  )
 
  # Step 3: Load the LoRA adapter
@@ -28,8 +37,9 @@ tokenizer = AutoTokenizer.from_pretrained(base_model_name)
 
  # Step 5: Define the inference function
  def generate_text(prompt, max_length=1024):
-     # Tokenize the input
-     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+     # Ensure the input context length does not exceed the model's limit
+     max_input_length = 1024  # Set maximum allowable context length
+     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_input_length).to("cuda")
 
      # Generate output with LoRA-enhanced model
      outputs = model_with_adapter.generate(**inputs, max_length=max_length)
@@ -42,11 +52,11 @@ iface = gr.Interface(
      fn=generate_text,
      inputs=[
          gr.Textbox(label="Prompt", placeholder="Enter your text prompt here..."),
-         gr.Slider(label="Max Length", minimum=50, maximum=1024, step=10, value=100),
+         gr.Slider(label="Max Length", minimum=50, maximum=1024, step=10, value=256),
      ],
      outputs="text",
      title="Unsloth + LoRA Text Generator",
-     description="Generate text using a 4-bit quantized LLaMA model with LoRA adapters."
+     description="Generate text using a 4-bit quantized LLaMA model with LoRA adapters. Supports up to 1024 tokens."
  )
 
  # Step 7: Launch the Gradio app
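
For reference, the load path this commit sets up can be sketched with the standard transformers + peft APIs. This is a minimal sketch, not the committed code: it assumes SlothModel.from_pretrained accepts the same keyword arguments as AutoModelForCausalLM.from_pretrained, and it reuses the model and adapter repos from the diff.

import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model_name = "unsloth/Llama-3.3-70B-Instruct"
adapter_repo = "daresearch/Llama-3.3-70B-ft-exec-roles"

# Align RoPE scaling with the adapter's training context *before* loading;
# rotary embeddings are built from the config at load time, so patching the
# config afterwards typically has no effect. The linear factor of 8.0 follows
# the diff's own comment (4096 * 8 = 32k tokens).
config = AutoConfig.from_pretrained(base_model_name)
config.rope_scaling = {"type": "linear", "factor": 8.0}

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    config=config,               # carries the RoPE adjustment
    load_in_4bit=True,           # 4-bit quantization via bitsandbytes, as in the diff
    device_map="auto",           # distribute layers across available devices
    torch_dtype=torch.float16,
)

# Attach the LoRA adapter on top of the quantized base model
model_with_adapter = PeftModel.from_pretrained(base_model, adapter_repo)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

The ordering is the point of the commit: the RoPE-scaled config must reach from_pretrained so the base model's positional handling matches what the adapter was fine-tuned against; otherwise prompts approaching the extended context length degrade.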