Braszczynski committed on
Commit ed36972
1 Parent(s): 8700197

Update app.py

Files changed (1)
  1. app.py +9 -11
app.py CHANGED

@@ -1,6 +1,7 @@
 import gradio as gr
 import torch
 from transformers import AutoTokenizer, AutoAdapterModel, TextStreamer
+from unsloth import FastLanguageModel
 
 # Configuration Variables
 model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit" # Replace with your actual model name
@@ -10,17 +11,14 @@ max_seq_length = 512 # Adjust as needed
 dtype = None # Example dtype, adjust based on your setup
 load_in_4bit = True # Set to True if you want to use 4-bit quantization
 
-# Dynamically select device
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-print(f"Using device: {device}")
 
-# Conditional import based on GPU availability
-if device.type == "cuda":
-    from unsloth import FastLanguageModel
-    model = AutoAdapterModel.from_pretrained(model_name, low_cpu_mem_usage=True).to(device)
-    model.load_adapter(lora_adapter)
-else:
-    raise RuntimeError("No CUDA GPU available. Please ensure your Space has GPU enabled.")
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = lora_adapter,
+    max_seq_length = max_seq_length,
+    dtype = dtype,
+    load_in_4bit = load_in_4bit,
+)
+FastLanguageModel.for_inference(model) # Enable native 2x faster inference
 
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     # Combine system message and chat history
@@ -35,7 +33,7 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
         return_tensors="pt",
         truncation=True,
         max_length=max_seq_length,
-    ).to(device)
+    ).to("cuda")
 
     # Generate the response
     with torch.no_grad():
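
The diff only shows fragments of respond(), so for context here is a minimal sketch of how the model and tokenizer loaded via FastLanguageModel.from_pretrained above might be used inside that function. The chat-template formatting, the (user, assistant) history format, and the generation arguments are illustrative assumptions, not the committed code; only model, tokenizer, max_seq_length, and the .to("cuda") call appear in the diff.

import torch

def respond(message, history, system_message, max_tokens, temperature, top_p):
    # model, tokenizer, and max_seq_length are assumed to be the globals
    # created by the FastLanguageModel.from_pretrained call shown in the diff.

    # Combine system message and chat history into a chat-template prompt.
    # The (user, assistant) tuple format of `history` is an assumption.
    messages = [{"role": "system", "content": system_message}]
    for user_turn, assistant_turn in history:
        messages.append({"role": "user", "content": user_turn})
        messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Tokenize and move to the GPU, mirroring the .to("cuda") change in the diff
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_seq_length,
    ).to("cuda")

    # Generate the response
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
        )

    # Decode only the newly generated tokens, dropping the prompt
    new_tokens = output_ids[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)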