daresearch committed
Commit c8846c6 · verified · 1 Parent(s): 4715b88

Update app.py

Files changed (1)
  1. app.py +30 -26
app.py CHANGED
@@ -1,50 +1,54 @@
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+from transformers import AutoTokenizer
+from unsloth import SlothModel  # For quantized base model
 from peft import PeftModel
 import gradio as gr
 
-# Step 1: Load the base model
-base_model_name = "meta-llama/Llama-3.3-70B-Instruct"
+# Step 1: Define the base model and LoRA adapter
+base_model_name = "meta-llama/Llama-3.3-70B-Instruct"  # Replace with the actual Unsloth-supported base model
 adapter_repo = "daresearch/Llama-3.3-70B-ft-exec-roles"
 
-# Load the base model (LlamaForCausalLM)
-base_model = AutoModelForCausalLM.from_pretrained(
+# Step 2: Load the 4-bit quantized base model using Unsloth
+base_model = SlothModel.from_pretrained(
     base_model_name,
-    device_map="auto",          # Distribute model across GPUs (if available)
-    torch_dtype=torch.float16,  # Use FP16 precision to save memory
+    load_in_4bit=True,          # Enable 4-bit quantization
+    device_map="auto",          # Automatically distribute across devices
+    torch_dtype=torch.float16,  # Use FP16 for efficiency
 )
 
-# Step 2: Load the LoRA adapter into the base model
+# Step 3: Load the LoRA adapter
 model_with_adapter = PeftModel.from_pretrained(
     base_model,
     adapter_repo,
-    device_map="auto",
+    device_map="auto",  # Ensure compatibility across devices
 )
 
-# Step 3: Extract the underlying base model from the LoRA wrapper
-underlying_model = model_with_adapter.merge_and_unload()  # Merges LoRA weights into base model
-
 # Step 4: Load the tokenizer
 tokenizer = AutoTokenizer.from_pretrained(base_model_name)
 
-# Step 5: Create the text generation pipeline using the underlying base model
-pipe = pipeline("text-generation", model=underlying_model, tokenizer=tokenizer)
-
-# Define the Gradio interface function
-def generate_text(prompt):
-    # Use the pipeline to generate text
-    outputs = pipe(prompt, max_length=200)
-    return outputs[0]["generated_text"]
-
-# Create the Gradio interface
+# Step 5: Define the inference function
+def generate_text(prompt, max_length=100):
+    # Tokenize the input
+    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+
+    # Generate output with LoRA-enhanced model
+    outputs = model_with_adapter.generate(**inputs, max_length=max_length)
+
+    # Decode and return the output
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+# Step 6: Create the Gradio interface
 iface = gr.Interface(
     fn=generate_text,
-    inputs="text",
+    inputs=[
+        gr.Textbox(label="Prompt", placeholder="Enter your text prompt here..."),
+        gr.Slider(label="Max Length", minimum=50, maximum=500, step=10, value=100),
+    ],
     outputs="text",
-    title="LoRA-Enhanced LLaMA Text Generator",
-    description="Provide a prompt, and the model will generate a response."
+    title="Unsloth + LoRA Text Generator",
+    description="Generate text using a 4-bit quantized LLaMA model with LoRA adapters."
 )
 
-# Launch the app
+# Step 7: Launch the Gradio app
 if __name__ == "__main__":
     iface.launch()
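
A note on the new code, separate from the commit itself: as far as I can tell, `SlothModel` is not a name the `unsloth` package exports; Unsloth's documented loader is `FastLanguageModel.from_pretrained`, which returns the tokenizer together with the model and applies 4-bit quantization at load time. The sketch below shows how Steps 2-5 of the new app.py might look under that assumption. The `max_seq_length` value, the `FastLanguageModel.for_inference` call, and the switch from `max_length` to `max_new_tokens` (so the prompt does not eat into the generation budget) are illustrative choices, not taken from the commit.

import torch
from unsloth import FastLanguageModel  # assumed loader; SlothModel is not a known Unsloth export
from peft import PeftModel

base_model_name = "meta-llama/Llama-3.3-70B-Instruct"
adapter_repo = "daresearch/Llama-3.3-70B-ft-exec-roles"

# Unsloth returns (model, tokenizer) and handles 4-bit loading itself.
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_name,
    max_seq_length=2048,   # assumed context window for this Space
    load_in_4bit=True,
    dtype=torch.float16,
)

# Attach the LoRA adapter on top of the quantized base model.
model_with_adapter = PeftModel.from_pretrained(base_model, adapter_repo)
FastLanguageModel.for_inference(model_with_adapter)  # Unsloth's inference-mode switch

def generate_text(prompt, max_length=100):
    # Keep inputs on the same device as the (possibly sharded) model.
    inputs = tokenizer(prompt, return_tensors="pt").to(model_with_adapter.device)
    # max_new_tokens bounds only the generated text; max_length would also count the prompt.
    outputs = model_with_adapter.generate(**inputs, max_new_tokens=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)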