colonelwatch committed
Commit c574006 · Parent: fd084a4

Compile the model for lower latency

Files changed (1): app.py (+2, −0)
app.py CHANGED
@@ -219,11 +219,13 @@ def main():
 
     model = get_model(model_name, trust_remote_code)
     index = get_index(dir, search_time_s)
+
     if torch.cuda.is_available():
         model = model.half().cuda() if fp16 else model.bfloat16().cuda()
         # TODO: if huggingface datasets exposes an fp16 gpu option, use it here
     elif fp16:
         print('warning: used "FP16" on CPU-only system, ignoring...', file=stderr)
+    model.compile(mode="reduce-overhead")
 
     # function signature: (expanded tuple of input batches) -> tuple of output batches
     def search(query: list[str]) -> tuple[list[str]]:
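
For context on what the one-line change does: `nn.Module.compile()` (available in recent PyTorch) compiles the module's `forward` in place via `torch.compile`, and `mode="reduce-overhead"` targets small-batch latency by using CUDA graphs where possible to cut per-call Python and kernel-launch overhead. Below is a minimal, self-contained sketch, not from the repo: the toy model and timing harness are illustrative stand-ins for `get_model()`, showing the one-time compilation cost on the first call and the faster compiled path afterwards.

```python
# Illustrative sketch only: the toy model stands in for the app's get_model().
import time

import torch
from torch import nn

model = nn.Sequential(nn.Linear(384, 384), nn.ReLU(), nn.Linear(384, 384))
if torch.cuda.is_available():
    model = model.half().cuda()

# In-place equivalent of `model = torch.compile(model)`; "reduce-overhead"
# trades extra compile time for lower per-call latency on small batches.
model.compile(mode="reduce-overhead")

x = torch.randn(1, 384).to(next(model.parameters()).device,
                           next(model.parameters()).dtype)

def timed_call() -> float:
    # Synchronize around the call so GPU time is actually measured.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    with torch.no_grad():
        model(x)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return time.perf_counter() - t0

print(f"first call (compiles):  {timed_call():.4f} s")
print(f"second call (compiled): {timed_call():.4f} s")
```

One caveat worth knowing: the first query after startup absorbs the compilation cost, so latency improves only from the second request onward, and a recompilation may be triggered if input shapes change.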