desert committed
Commit 5ccb54c · Parent(s): d67d04a
Files changed (1):
  1. app.py (+7 -5)
app.py CHANGED
@@ -6,9 +6,10 @@ max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
 dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
 load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
 
-# Check for GPU availability and use the appropriate device
-device = "cuda" if torch.cuda.is_available() else "cpu"
+# Force the model to run on CPU only by setting the device to "cpu"
+device = "cpu"
 
+# Load model and tokenizer with the device set to "cpu"
 model, tokenizer = FastLanguageModel.from_pretrained(
     model_name="llama_lora_model_1",
     max_seq_length=max_seq_length,
@@ -16,7 +17,8 @@ model, tokenizer = FastLanguageModel.from_pretrained(
     load_in_4bit=load_in_4bit,
 )
 
-model.to(device) # Move model to the appropriate device
+# Move the model to CPU (even if it was initially loaded with GPU support)
+model.to(device)
 
 # Respond function
 def respond(
@@ -48,9 +50,9 @@ def respond(
         return_tensors="pt",
     )
 
-    # Generate the response using your model
+    # Generate the response using your model on CPU
     outputs = model.generate(
-        input_ids=inputs["input_ids"].to(device), # Ensure input is on the correct device
+        input_ids=inputs["input_ids"].to(device), # Ensure input is on the CPU
         max_new_tokens=max_tokens,
         temperature=temperature,
         top_p=top_p,
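Taken together, the load path of app.py after this commit reads roughly as the sketch below, assuming the unsloth FastLanguageModel API shown in the diff. One caveat, flagged as an assumption: 4-bit loading is backed by bitsandbytes, which generally expects a CUDA device, so a CPU-only host may also need load_in_4bit = False.

# Sketch of the post-commit load path (assumes unsloth is installed;
# load_in_4bit=True is kept from the diff but may require a GPU backend)
from unsloth import FastLanguageModel

max_seq_length = 2048  # RoPE scaling handled internally by unsloth
dtype = None           # auto-detect: float16 on T4/V100, bfloat16 on Ampere+
load_in_4bit = True    # 4-bit quantization to reduce memory usage

device = "cpu"         # pinned to CPU by this commit

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="llama_lora_model_1",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model.to(device)  # a no-op when the model is already on CPU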
 
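Inside respond(), the only device-sensitive step is then moving the tokenized input to the same device before generation. A minimal sketch with hypothetical placeholder values for the prompt and the sampling parameters (the real values come from respond()'s arguments):

# Sketch of the generation call on CPU, mirroring the respond() change
inputs = tokenizer("Hello!", return_tensors="pt")  # "Hello!" is a placeholder prompt

outputs = model.generate(
    input_ids=inputs["input_ids"].to(device),  # inputs must live on the model's device
    max_new_tokens=256,   # placeholder for the max_tokens argument
    temperature=0.7,      # placeholder sampling parameters
    top_p=0.95,
    do_sample=True,       # temperature/top_p only apply when sampling is enabled
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))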