mike23415 committed
Commit ffab90e · verified · 1 Parent(s): b097938

Update app.py

Files changed (1): app.py (+7 -8)
app.py CHANGED
@@ -12,8 +12,8 @@ warnings.filterwarnings("ignore")
 logging.set_verbosity_error()
 
 # Global variables
-# Updated to use a model that's actually available on Hugging Face
-MODEL_ID = "microsoft/phi-2"  # Alternative: "microsoft/phi-1_5" or any other available model
+# Using phi-2 which is a smaller model that can run on CPU
+MODEL_ID = "microsoft/phi-2"
 MAX_LENGTH = 2048
 MAX_NEW_TOKENS = 512
 TEMPERATURE = 0.7
@@ -41,12 +41,11 @@ def load_model_and_tokenizer():
         trust_remote_code=True
     )
 
-    # Load model with optimizations for limited resources
+    # Load model with CPU optimizations - removed 4-bit quantization
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
-        device_map="auto",
-        torch_dtype=torch.bfloat16,
-        load_in_4bit=True,
+        low_cpu_mem_usage=True,  # Optimize for CPU usage
+        torch_dtype=torch.float32,  # Use float32 instead of bfloat16 for better CPU compatibility
         trust_remote_code=True
     )
 
@@ -73,7 +72,7 @@ def generate_with_thinking(prompt, thinking_steps=THINKING_STEPS):
     thinking_output = ""
     for step in range(thinking_steps):
         # Generate step i of thinking
-        inputs = tokenizer(thinking_prompt + thinking_output, return_tensors="pt").to(model.device)
+        inputs = tokenizer(thinking_prompt + thinking_output, return_tensors="pt")
 
         with torch.no_grad():
             outputs = model.generate(
@@ -96,7 +95,7 @@ def generate_with_thinking(prompt, thinking_steps=THINKING_STEPS):
     # Now generate final answer based on the thinking
     final_prompt = full_prompt + "\n\n" + thinking_output + "\n\nBased on this thinking, my final answer is:"
 
-    inputs = tokenizer(final_prompt, return_tensors="pt").to(model.device)
+    inputs = tokenizer(final_prompt, return_tensors="pt")
     with torch.no_grad():
         outputs = model.generate(
             inputs["input_ids"],