mike23415 committed
Commit ffab90e · verified · 1 Parent(s): b097938

Update app.py

Files changed (1): app.py (+7 -8)
app.py CHANGED
@@ -12,8 +12,8 @@ warnings.filterwarnings("ignore")
 logging.set_verbosity_error()
 
 # Global variables
-# Updated to use a model that's actually available on Hugging Face
-MODEL_ID = "microsoft/phi-2"  # Alternative: "microsoft/phi-1_5" or any other available model
+# Using phi-2 which is a smaller model that can run on CPU
+MODEL_ID = "microsoft/phi-2"
 MAX_LENGTH = 2048
 MAX_NEW_TOKENS = 512
 TEMPERATURE = 0.7
@@ -41,12 +41,11 @@ def load_model_and_tokenizer():
         trust_remote_code=True
     )
 
-    # Load model with optimizations for limited resources
+    # Load model with CPU optimizations - removed 4-bit quantization
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
-        device_map="auto",
-        torch_dtype=torch.bfloat16,
-        load_in_4bit=True,
+        low_cpu_mem_usage=True,  # Optimize for CPU usage
+        torch_dtype=torch.float32,  # Use float32 instead of bfloat16 for better CPU compatibility
         trust_remote_code=True
     )
 
@@ -73,7 +72,7 @@ def generate_with_thinking(prompt, thinking_steps=THINKING_STEPS):
     thinking_output = ""
     for step in range(thinking_steps):
         # Generate step i of thinking
-        inputs = tokenizer(thinking_prompt + thinking_output, return_tensors="pt").to(model.device)
+        inputs = tokenizer(thinking_prompt + thinking_output, return_tensors="pt")
 
         with torch.no_grad():
             outputs = model.generate(
@@ -96,7 +95,7 @@ def generate_with_thinking(prompt, thinking_steps=THINKING_STEPS):
     # Now generate final answer based on the thinking
     final_prompt = full_prompt + "\n\n" + thinking_output + "\n\nBased on this thinking, my final answer is:"
 
-    inputs = tokenizer(final_prompt, return_tensors="pt").to(model.device)
+    inputs = tokenizer(final_prompt, return_tensors="pt")
     with torch.no_grad():
         outputs = model.generate(
             inputs["input_ids"],