Spaces:

daresearch
/

llama-70-merge-space

Runtime error

App Files Community

daresearch committed on Dec 25, 2024

Commit

aef46fe

verified ·

1 Parent(s): 4942613

Update app.py

Browse files

Files changed (1) hide show

app.py +10 -27

app.py CHANGED Viewed

@@ -1,35 +1,24 @@
 import torch
-from transformers import AutoTokenizer, AutoConfig
-from unsloth import SlothModel  # For quantized base model
 from peft import PeftModel
 import gradio as gr
 # Step 1: Define the base model and LoRA adapter
-base_model_name = "unsloth/Llama-3.3-70B-Instruct"  # Replace with the actual Unsloth-supported base model
 adapter_repo = "daresearch/Llama-3.3-70B-ft-exec-roles"
-# Step 2: Load the 4-bit quantized base model using Unsloth with RoPE adjustment
-# Check and align RoPE scaling for extended context (if needed)
-config = AutoConfig.from_pretrained(base_model_name)
-config.rope_scaling = {
-    "type": "linear",  # Use "linear" or "dynamic" scaling
-    "factor": 8.0      # Adjust factor based on adapter's context length (e.g., 4096 * 8 = 32k tokens)
-}
-# Load the quantized base model
-base_model = SlothModel.from_pretrained(
     base_model_name,
-    load_in_4bit=True,              # Enable 4-bit quantization
-    device_map="auto",              # Automatically distribute across devices
-    torch_dtype=torch.float16,      # Use FP16 for efficiency
-    config=config,                  # Pass updated configuration
 )
 # Step 3: Load the LoRA adapter
 model_with_adapter = PeftModel.from_pretrained(
     base_model,
     adapter_repo,
-    device_map="auto",        # Ensure compatibility across devices
 )
 # Step 4: Load the tokenizer
@@ -37,14 +26,8 @@ tokenizer = AutoTokenizer.from_pretrained(base_model_name)
 # Step 5: Define the inference function
 def generate_text(prompt, max_length=1024):
-    # Ensure the input context length does not exceed the model's limit
-    max_input_length = 1024  # Set maximum allowable context length
-    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_input_length).to("cuda")
-    # Generate output with LoRA-enhanced model
     outputs = model_with_adapter.generate(**inputs, max_length=max_length)
-    # Decode and return the output
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
 # Step 6: Create the Gradio interface
@@ -55,8 +38,8 @@ iface = gr.Interface(
         gr.Slider(label="Max Length", minimum=50, maximum=1024, step=10, value=256),
     ],
     outputs="text",
-    title="Unsloth + LoRA Text Generator",
-    description="Generate text using a 4-bit quantized LLaMA model with LoRA adapters. Supports up to 1024 tokens."
 )
 # Step 7: Launch the Gradio app

 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
 from peft import PeftModel
 import gradio as gr
 # Step 1: Define the base model and LoRA adapter
+base_model_name = "meta-llama/Llama-3.3-70B-Instruct"  # Replace with correct model name
 adapter_repo = "daresearch/Llama-3.3-70B-ft-exec-roles"
+# Step 2: Load the base model
+base_model = AutoModelForCausalLM.from_pretrained(
     base_model_name,
+    device_map="auto",
+    torch_dtype=torch.float16,  # Use FP16 precision
 )
 # Step 3: Load the LoRA adapter
 model_with_adapter = PeftModel.from_pretrained(
     base_model,
     adapter_repo,
+    device_map="auto",
 )
 # Step 4: Load the tokenizer
 # Step 5: Define the inference function
 def generate_text(prompt, max_length=1024):
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to("cuda")
     outputs = model_with_adapter.generate(**inputs, max_length=max_length)
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
 # Step 6: Create the Gradio interface
         gr.Slider(label="Max Length", minimum=50, maximum=1024, step=10, value=256),
     ],
     outputs="text",
+    title="LLaMA + LoRA Text Generator",
+    description="Generate text using a LLaMA model with LoRA adapters."
 )
 # Step 7: Launch the Gradio app