desert committed
Commit · 5ccb54c
1 Parent(s): d67d04a
del
app.py CHANGED
@@ -6,9 +6,10 @@ max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
 dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
 load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
 
-#
-device = "
+# Force the model to run on CPU only by setting the device to "cpu"
+device = "cpu"
 
+# Load model and tokenizer with the device set to "cpu"
 model, tokenizer = FastLanguageModel.from_pretrained(
     model_name="llama_lora_model_1",
     max_seq_length=max_seq_length,
@@ -16,7 +17,8 @@ model, tokenizer = FastLanguageModel.from_pretrained(
     load_in_4bit=load_in_4bit,
 )
 
-
+# Move the model to CPU (even if it was initially loaded with GPU support)
+model.to(device)
 
 # Respond function
 def respond(
@@ -48,9 +50,9 @@ def respond(
         return_tensors="pt",
     )
 
-    # Generate the response using your model
+    # Generate the response using your model on CPU
     outputs = model.generate(
-        input_ids=inputs["input_ids"].to(device), # Ensure input is on the
+        input_ids=inputs["input_ids"].to(device), # Ensure input is on the CPU
         max_new_tokens=max_tokens,
         temperature=temperature,
         top_p=top_p,
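
For context, a minimal self-contained sketch of the CPU-only flow this commit produces. The `from unsloth import FastLanguageModel` import, the prompt string, and the sampling values are assumptions for illustration (the diff does not show them), and `load_in_4bit` is switched to False here because bitsandbytes 4-bit quantization generally requires a CUDA GPU; Unsloth itself typically expects a GPU too, so treat this as a sketch of the intended flow rather than a guaranteed CPU run.

# Sketch only: import path assumed; the commit diff does not show it.
from unsloth import FastLanguageModel

max_seq_length = 2048
device = "cpu"  # force CPU execution, as in the commit

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="llama_lora_model_1",
    max_seq_length=max_seq_length,
    dtype=None,          # auto-detect
    load_in_4bit=False,  # assumption: 4-bit (bitsandbytes) generally needs CUDA
)
model.to(device)  # move weights to CPU even if loaded with GPU support

inputs = tokenizer("Hello!", return_tensors="pt")  # hypothetical prompt
outputs = model.generate(
    input_ids=inputs["input_ids"].to(device),  # keep inputs on the model's device
    max_new_tokens=64,
    temperature=0.7,
    top_p=0.9,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))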