daresearch committed
Commit c8846c6 · verified · 1 Parent(s): 4715b88

Update app.py

Files changed (1)
  1. app.py +30 -26
app.py CHANGED
@@ -1,50 +1,54 @@
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+from transformers import AutoTokenizer
+from unsloth import SlothModel  # For quantized base model
 from peft import PeftModel
 import gradio as gr
 
-# Step 1: Load the base model
-base_model_name = "meta-llama/Llama-3.3-70B-Instruct"
+# Step 1: Define the base model and LoRA adapter
+base_model_name = "meta-llama/Llama-3.3-70B-Instruct"  # Replace with the actual Unsloth-supported base model
 adapter_repo = "daresearch/Llama-3.3-70B-ft-exec-roles"
 
-# Load the base model (LlamaForCausalLM)
-base_model = AutoModelForCausalLM.from_pretrained(
+# Step 2: Load the 4-bit quantized base model using Unsloth
+base_model = SlothModel.from_pretrained(
     base_model_name,
-    device_map="auto",          # Distribute model across GPUs (if available)
-    torch_dtype=torch.float16,  # Use FP16 precision to save memory
+    load_in_4bit=True,          # Enable 4-bit quantization
+    device_map="auto",          # Automatically distribute across devices
+    torch_dtype=torch.float16,  # Use FP16 for efficiency
 )
 
-# Step 2: Load the LoRA adapter into the base model
+# Step 3: Load the LoRA adapter
 model_with_adapter = PeftModel.from_pretrained(
     base_model,
     adapter_repo,
-    device_map="auto",
+    device_map="auto",  # Ensure compatibility across devices
 )
 
-# Step 3: Extract the underlying base model from the LoRA wrapper
-underlying_model = model_with_adapter.merge_and_unload()  # Merges LoRA weights into base model
-
 # Step 4: Load the tokenizer
 tokenizer = AutoTokenizer.from_pretrained(base_model_name)
 
-# Step 5: Create the text generation pipeline using the underlying base model
-pipe = pipeline("text-generation", model=underlying_model, tokenizer=tokenizer)
-
-# Define the Gradio interface function
-def generate_text(prompt):
-    # Use the pipeline to generate text
-    outputs = pipe(prompt, max_length=200)
-    return outputs[0]["generated_text"]
-
-# Create the Gradio interface
+# Step 5: Define the inference function
+def generate_text(prompt, max_length=100):
+    # Tokenize the input
+    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+
+    # Generate output with LoRA-enhanced model
+    outputs = model_with_adapter.generate(**inputs, max_length=max_length)
+
+    # Decode and return the output
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+# Step 6: Create the Gradio interface
 iface = gr.Interface(
     fn=generate_text,
-    inputs="text",
+    inputs=[
+        gr.Textbox(label="Prompt", placeholder="Enter your text prompt here..."),
+        gr.Slider(label="Max Length", minimum=50, maximum=500, step=10, value=100),
+    ],
     outputs="text",
-    title="LoRA-Enhanced LLaMA Text Generator",
-    description="Provide a prompt, and the model will generate a response."
+    title="Unsloth + LoRA Text Generator",
+    description="Generate text using a 4-bit quantized LLaMA model with LoRA adapters."
 )
 
-# Launch the app
+# Step 7: Launch the Gradio app
 if __name__ == "__main__":
     iface.launch()
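
A note on the new code, separate from the commit itself: as far as I can tell, `SlothModel` is not a name the `unsloth` package exports; Unsloth's documented loader is `FastLanguageModel.from_pretrained`, which returns the tokenizer together with the model and applies 4-bit quantization at load time. The sketch below shows how Steps 2-5 of the new app.py might look under that assumption. The `max_seq_length` value, the `FastLanguageModel.for_inference` call, and the switch from `max_length` to `max_new_tokens` (so the prompt does not eat into the generation budget) are illustrative choices, not taken from the commit.

import torch
from unsloth import FastLanguageModel  # assumed loader; SlothModel is not a known Unsloth export
from peft import PeftModel

base_model_name = "meta-llama/Llama-3.3-70B-Instruct"
adapter_repo = "daresearch/Llama-3.3-70B-ft-exec-roles"

# Unsloth returns (model, tokenizer) and handles 4-bit loading itself.
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_name,
    max_seq_length=2048,   # assumed context window for this Space
    load_in_4bit=True,
    dtype=torch.float16,
)

# Attach the LoRA adapter on top of the quantized base model.
model_with_adapter = PeftModel.from_pretrained(base_model, adapter_repo)
FastLanguageModel.for_inference(model_with_adapter)  # Unsloth's inference-mode switch

def generate_text(prompt, max_length=100):
    # Keep inputs on the same device as the (possibly sharded) model.
    inputs = tokenizer(prompt, return_tensors="pt").to(model_with_adapter.device)
    # max_new_tokens bounds only the generated text; max_length would also count the prompt.
    outputs = model_with_adapter.generate(**inputs, max_new_tokens=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)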