daresearch committed
Commit d007172 · verified · 1 parent: f9c104e

Update app.py

Files changed (1):
  app.py (+19 −9)
app.py CHANGED
@@ -1,5 +1,5 @@
  import torch
- from transformers import AutoTokenizer
+ from transformers import AutoTokenizer, AutoConfig
  from unsloth import SlothModel  # For quantized base model
  from peft import PeftModel
  import gradio as gr
@@ -8,12 +8,21 @@ import gradio as gr
  base_model_name = "unsloth/Llama-3.3-70B-Instruct"  # Replace with the actual Unsloth-supported base model
  adapter_repo = "daresearch/Llama-3.3-70B-ft-exec-roles"
 
- # Step 2: Load the 4-bit quantized base model using Unsloth
+ # Step 2: Load the 4-bit quantized base model using Unsloth with RoPE adjustment
+ # Check and align RoPE scaling for extended context (if needed)
+ config = AutoConfig.from_pretrained(base_model_name)
+ config.rope_scaling = {
+     "type": "linear",  # Use "linear" or "dynamic" scaling
+     "factor": 8.0,     # Adjust factor based on adapter's context length (e.g., 4096 * 8 = 32k tokens)
+ }
+
+ # Load the quantized base model
  base_model = SlothModel.from_pretrained(
      base_model_name,
-     load_in_4bit=True,           # Enable 4-bit quantization
-     device_map="auto",           # Automatically distribute across devices
-     torch_dtype=torch.float16,   # Use FP16 for efficiency
+     load_in_4bit=True,           # Enable 4-bit quantization
+     device_map="auto",           # Automatically distribute across devices
+     torch_dtype=torch.float16,   # Use FP16 for efficiency
+     config=config,               # Pass the updated configuration
  )
 
  # Step 3: Load the LoRA adapter
@@ -28,8 +37,9 @@ tokenizer = AutoTokenizer.from_pretrained(base_model_name)
 
  # Step 5: Define the inference function
  def generate_text(prompt, max_length=1024):
-     # Tokenize the input
-     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+     # Ensure the input context length does not exceed the model's limit
+     max_input_length = 1024  # Set maximum allowable context length
+     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_input_length).to("cuda")
 
      # Generate output with LoRA-enhanced model
      outputs = model_with_adapter.generate(**inputs, max_length=max_length)
@@ -42,11 +52,11 @@ iface = gr.Interface(
      fn=generate_text,
      inputs=[
          gr.Textbox(label="Prompt", placeholder="Enter your text prompt here..."),
-         gr.Slider(label="Max Length", minimum=50, maximum=1024, step=10, value=100),
+         gr.Slider(label="Max Length", minimum=50, maximum=1024, step=10, value=256),
      ],
      outputs="text",
      title="Unsloth + LoRA Text Generator",
-     description="Generate text using a 4-bit quantized LLaMA model with LoRA adapters."
+     description="Generate text using a 4-bit quantized LLaMA model with LoRA adapters. Supports up to 1024 tokens."
  )
 
  # Step 7: Launch the Gradio app
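
For reference, the load path this commit sets up can be sketched with the standard transformers + peft APIs. This is a minimal sketch, not the committed code: it assumes SlothModel.from_pretrained accepts the same keyword arguments as AutoModelForCausalLM.from_pretrained, and it reuses the model and adapter repos from the diff.

import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model_name = "unsloth/Llama-3.3-70B-Instruct"
adapter_repo = "daresearch/Llama-3.3-70B-ft-exec-roles"

# Align RoPE scaling with the adapter's training context *before* loading;
# rotary embeddings are built from the config at load time, so patching the
# config afterwards typically has no effect. The linear factor of 8.0 follows
# the diff's own comment (4096 * 8 = 32k tokens).
config = AutoConfig.from_pretrained(base_model_name)
config.rope_scaling = {"type": "linear", "factor": 8.0}

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    config=config,               # carries the RoPE adjustment
    load_in_4bit=True,           # 4-bit quantization via bitsandbytes, as in the diff
    device_map="auto",           # distribute layers across available devices
    torch_dtype=torch.float16,
)

# Attach the LoRA adapter on top of the quantized base model
model_with_adapter = PeftModel.from_pretrained(base_model, adapter_repo)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

The ordering is the point of the commit: the RoPE-scaled config must reach from_pretrained so the base model's positional handling matches what the adapter was fine-tuned against; otherwise prompts approaching the extended context length degrade.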