daresearch committed · verified
Commit aef46fe · Parent: 4942613

Update app.py

Files changed (1):
  1. app.py (+10, -27)
app.py CHANGED
```diff
@@ -1,35 +1,24 @@
 import torch
-from transformers import AutoTokenizer, AutoConfig
-from unsloth import SlothModel  # For quantized base model
+from transformers import AutoModelForCausalLM, AutoTokenizer
 from peft import PeftModel
 import gradio as gr
 
 # Step 1: Define the base model and LoRA adapter
-base_model_name = "unsloth/Llama-3.3-70B-Instruct"  # Replace with the actual Unsloth-supported base model
+base_model_name = "meta-llama/Llama-3.3-70B-Instruct"  # Replace with correct model name
 adapter_repo = "daresearch/Llama-3.3-70B-ft-exec-roles"
 
-# Step 2: Load the 4-bit quantized base model using Unsloth with RoPE adjustment
-# Check and align RoPE scaling for extended context (if needed)
-config = AutoConfig.from_pretrained(base_model_name)
-config.rope_scaling = {
-    "type": "linear",  # Use "linear" or "dynamic" scaling
-    "factor": 8.0  # Adjust factor based on adapter's context length (e.g., 4096 * 8 = 32k tokens)
-}
-
-# Load the quantized base model
-base_model = SlothModel.from_pretrained(
+# Step 2: Load the base model
+base_model = AutoModelForCausalLM.from_pretrained(
     base_model_name,
-    load_in_4bit=True,  # Enable 4-bit quantization
-    device_map="auto",  # Automatically distribute across devices
-    torch_dtype=torch.float16,  # Use FP16 for efficiency
-    config=config,  # Pass updated configuration
+    device_map="auto",
+    torch_dtype=torch.float16,  # Use FP16 precision
 )
 
 # Step 3: Load the LoRA adapter
 model_with_adapter = PeftModel.from_pretrained(
     base_model,
     adapter_repo,
-    device_map="auto",  # Ensure compatibility across devices
+    device_map="auto",
 )
 
 # Step 4: Load the tokenizer
```
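The rewrite drops the Unsloth path entirely; `SlothModel` is not a class Unsloth ships (its loader is `FastLanguageModel`), so the old import likely broke the Space. The trade-off is that 4-bit quantization goes away: at fp16, a 70B-parameter model needs roughly 140 GB of GPU memory. If that does not fit, a minimal sketch of an alternative that is *not* part of this commit: keep 4-bit weights through transformers' bitsandbytes integration.

```python
# Sketch only (assumption, not part of this commit): 4-bit loading without
# Unsloth, via transformers' bitsandbytes integration.
# Requires: pip install bitsandbytes accelerate
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",             # NF4 generally preserves LLM weights better than FP4
    bnb_4bit_compute_dtype=torch.float16,  # match the fp16 compute used above
)

base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.3-70B-Instruct",
    quantization_config=bnb_config,
    device_map="auto",
)
```

PEFT can attach the LoRA adapter to a bitsandbytes-quantized base just as in Step 3, so the rest of the script would be unchanged.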
```diff
@@ -37,14 +26,8 @@ tokenizer = AutoTokenizer.from_pretrained(base_model_name)
 
 # Step 5: Define the inference function
 def generate_text(prompt, max_length=1024):
-    # Ensure the input context length does not exceed the model's limit
-    max_input_length = 1024  # Set maximum allowable context length
-    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_input_length).to("cuda")
-
-    # Generate output with LoRA-enhanced model
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to("cuda")
     outputs = model_with_adapter.generate(**inputs, max_length=max_length)
-
-    # Decode and return the output
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 # Step 6: Create the Gradio interface
```
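One caveat survives the simplification: `generate`'s `max_length` counts prompt tokens plus generated tokens, so a prompt near the 1024-token truncation limit leaves little or no room for a completion. A minimal sketch of the usual fix, reusing the `tokenizer` and `model_with_adapter` names from above: bound only the completion with `max_new_tokens` and strip the echoed prompt before decoding.

```python
def generate_text(prompt, max_new_tokens=256):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to("cuda")
    # max_new_tokens bounds only the completion, independent of prompt length
    outputs = model_with_adapter.generate(**inputs, max_new_tokens=max_new_tokens)
    # decode only the tokens produced after the prompt
    completion = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(completion, skip_special_tokens=True)
```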
```diff
@@ -55,8 +38,8 @@ iface = gr.Interface(
         gr.Slider(label="Max Length", minimum=50, maximum=1024, step=10, value=256),
     ],
     outputs="text",
-    title="Unsloth + LoRA Text Generator",
-    description="Generate text using a 4-bit quantized LLaMA model with LoRA adapters. Supports up to 1024 tokens."
+    title="LLaMA + LoRA Text Generator",
+    description="Generate text using a LLaMA model with LoRA adapters."
 )
 
 # Step 7: Launch the Gradio app
```
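This last hunk shows only the tail of the `gr.Interface(...)` call; the `fn=` argument and the first `inputs=` entry sit outside the diff context and are not part of this commit. A sketch of how the full Step 6–7 wiring plausibly fits together, where `fn=generate_text` and the `gr.Textbox` prompt input are assumptions:

```python
# Sketch of the full interface; fn= and the Textbox input are assumptions,
# since those lines fall outside the diff hunk.
iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(label="Prompt", lines=4),
        gr.Slider(label="Max Length", minimum=50, maximum=1024, step=10, value=256),
    ],
    outputs="text",
    title="LLaMA + LoRA Text Generator",
    description="Generate text using a LLaMA model with LoRA adapters."
)

# Step 7: Launch the Gradio app
iface.launch()
```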
 