desert committed
Commit d67d04a
1 Parent(s): 21886ee
Files changed (1)
  1. app.py +13 -8
app.py CHANGED
@@ -2,17 +2,22 @@ import gradio as gr
 from unsloth import FastLanguageModel
 import torch
 
-max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
-dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
-load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
+max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
+dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
+
+# Check for GPU availability and use the appropriate device
+device = "cuda" if torch.cuda.is_available() else "cpu"
 
 model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name = "llama_lora_model_1",
-    max_seq_length = max_seq_length,
-    dtype = dtype,
-    load_in_4bit = load_in_4bit,
+    model_name="llama_lora_model_1",
+    max_seq_length=max_seq_length,
+    dtype=dtype,
+    load_in_4bit=load_in_4bit,
 )
 
+model.to(device) # Move model to the appropriate device
+
 # Respond function
 def respond(
     message,
@@ -45,7 +50,7 @@ def respond(
 
     # Generate the response using your model
     outputs = model.generate(
-        input_ids=inputs["input_ids"],
+        input_ids=inputs["input_ids"].to(device), # Ensure input is on the correct device
         max_new_tokens=max_tokens,
         temperature=temperature,
         top_p=top_p,
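
For context, a minimal sketch of how the changed pieces fit together at inference time. The device selection and the from_pretrained call mirror the hunks above; the respond body, prompt handling, decode step, and the gr.ChatInterface wiring are simplifying assumptions, not part of this commit.

import gradio as gr
import torch
from unsloth import FastLanguageModel

max_seq_length = 2048
dtype = None          # auto-detect
load_in_4bit = True

# Same device selection as the commit
device = "cuda" if torch.cuda.is_available() else "cpu"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="llama_lora_model_1",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model.to(device)  # keep the model on the selected device

def respond(message, history, max_tokens=256, temperature=0.7, top_p=0.9):
    # Prompt construction is simplified here; the real app builds it from the chat history
    inputs = tokenizer(message, return_tensors="pt")
    outputs = model.generate(
        input_ids=inputs["input_ids"].to(device),  # inputs must be on the same device as the model
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

demo = gr.ChatInterface(respond)
demo.launch()

The point of the two .to(device) calls is that model.generate raises a device-mismatch error when the model weights and input_ids sit on different devices; picking the device once and moving both to it keeps the app working whether or not a GPU is available.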