Update app.py
app.py CHANGED
@@ -12,8 +12,8 @@ warnings.filterwarnings("ignore")
 logging.set_verbosity_error()
 
 # Global variables
-#
-MODEL_ID = "microsoft/phi-2"
+# Using phi-2 which is a smaller model that can run on CPU
+MODEL_ID = "microsoft/phi-2"
 MAX_LENGTH = 2048
 MAX_NEW_TOKENS = 512
 TEMPERATURE = 0.7
@@ -41,12 +41,11 @@ def load_model_and_tokenizer():
         trust_remote_code=True
     )
 
-    # Load model with optimizations
+    # Load model with CPU optimizations - removed 4-bit quantization
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
-
-        torch_dtype=torch.bfloat16,
-        load_in_4bit=True,
+        low_cpu_mem_usage=True,  # Optimize for CPU usage
+        torch_dtype=torch.float32,  # Use float32 instead of bfloat16 for better CPU compatibility
         trust_remote_code=True
     )
 
@@ -73,7 +72,7 @@ def generate_with_thinking(prompt, thinking_steps=THINKING_STEPS):
     thinking_output = ""
     for step in range(thinking_steps):
         # Generate step i of thinking
-        inputs = tokenizer(thinking_prompt + thinking_output, return_tensors="pt")
+        inputs = tokenizer(thinking_prompt + thinking_output, return_tensors="pt")
 
         with torch.no_grad():
             outputs = model.generate(
@@ -96,7 +95,7 @@ def generate_with_thinking(prompt, thinking_steps=THINKING_STEPS):
     # Now generate final answer based on the thinking
     final_prompt = full_prompt + "\n\n" + thinking_output + "\n\nBased on this thinking, my final answer is:"
 
-    inputs = tokenizer(final_prompt, return_tensors="pt")
+    inputs = tokenizer(final_prompt, return_tensors="pt")
     with torch.no_grad():
         outputs = model.generate(
             inputs["input_ids"],
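For reference, a minimal sketch of how load_model_and_tokenizer() reads after this change. The diff only shows the tail end of the tokenizer call, so loading it via AutoTokenizer and returning both objects are assumptions; the model-loading arguments are taken directly from the added lines.

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

MODEL_ID = "microsoft/phi-2"

def load_model_and_tokenizer():
    # Tokenizer loading is assumed; the diff only shows its closing arguments
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        trust_remote_code=True
    )

    # Load model with CPU optimizations - removed 4-bit quantization
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        low_cpu_mem_usage=True,     # Optimize for CPU usage
        torch_dtype=torch.float32,  # float32 is more broadly supported on CPU than bfloat16
        trust_remote_code=True
    )
    return model, tokenizer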
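The last two hunks touch generate_with_thinking(). The removed and added tokenizer lines render identically here, so whatever difference they carry is not visible; the sketch below keeps them as shown. Prompt construction, the decoding of new tokens, and the exact model.generate() arguments are not part of the diff and are filled in as assumptions (using the MAX_NEW_TOKENS and TEMPERATURE globals defined at the top of the file).

def generate_with_thinking(prompt, thinking_steps=THINKING_STEPS):
    # Prompt construction is not shown in the diff; assume a simple template
    full_prompt = prompt
    thinking_prompt = full_prompt + "\n\nLet me think step by step:"

    thinking_output = ""
    for step in range(thinking_steps):
        # Generate step i of thinking
        inputs = tokenizer(thinking_prompt + thinking_output, return_tensors="pt")

        with torch.no_grad():
            outputs = model.generate(
                inputs["input_ids"],
                max_new_tokens=MAX_NEW_TOKENS,  # assumed generation settings
                temperature=TEMPERATURE,
                do_sample=True,
            )
        # Keep only the newly generated tokens and accumulate them (assumed)
        new_text = tokenizer.decode(
            outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
        )
        thinking_output += new_text

    # Now generate final answer based on the thinking
    final_prompt = full_prompt + "\n\n" + thinking_output + "\n\nBased on this thinking, my final answer is:"

    inputs = tokenizer(final_prompt, return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=TEMPERATURE,
            do_sample=True,
        )
    return tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )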