import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import gradio as gr
# Step 1: Load base model
base_model_name = "meta-llama/Llama-3.3-70B-Instruct"
adapter_repo = "daresearch/Llama-3.3-70B-ft-exec-roles"
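# Note: device_map="auto" lets Accelerate place the 70B weights across all
# available GPUs (spilling to CPU if necessary); float16 halves the memory
# footprint relative to fp32.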
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)
# Step 2: Load LoRA adapter
model_with_adapter = PeftModel.from_pretrained(
    base_model,
    adapter_repo,
    device_map="auto",
)
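# Optional: merge the LoRA weights into the base model for slightly faster
# inference. Skip this if you plan to swap or stack adapters later.
# model_with_adapter = model_with_adapter.merge_and_unload()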
print(f"Loaded LoRA adapter from {adapter_repo}")
# Verify the adapter configuration attached to the base model
print(model_with_adapter.peft_config)
# Step 3: Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
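# Llama tokenizers usually ship without a pad token; fall back to EOS so any
# padded/batched tokenization does not fail (only needed if you batch inputs).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token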
# Step 4: Define inference function
def generate_text(prompt, max_length=1024):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to("cuda")
    # max_length counts prompt + generated tokens; raise it (or switch to
    # max_new_tokens) if your prompts are long.
    outputs = model_with_adapter.generate(**inputs, max_length=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
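# Quick sanity check with a hypothetical prompt (uncomment to test outside Gradio):
# print(generate_text("Describe the responsibilities of a chief executive.", max_length=128))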
# Step 5: Create Gradio interface
iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(label="Prompt", placeholder="Enter your text prompt here..."),
        gr.Slider(label="Max Length", minimum=50, maximum=1024, step=10, value=256),
    ],
    outputs="text",
    title="LLaMA + LoRA Text Generator",
    description="Generate text using a LLaMA model with LoRA adapters.",
)
# Step 6: Launch Gradio app
if __name__ == "__main__":
    iface.launch()