colonelwatch committed
Commit c574006 · Parent: fd084a4

Compile the model for lower latency

Files changed (1): app.py (+2, −0)
app.py CHANGED
@@ -219,11 +219,13 @@ def main():
 
     model = get_model(model_name, trust_remote_code)
     index = get_index(dir, search_time_s)
+
     if torch.cuda.is_available():
         model = model.half().cuda() if fp16 else model.bfloat16().cuda()
         # TODO: if huggingface datasets exposes an fp16 gpu option, use it here
     elif fp16:
         print('warning: used "FP16" on CPU-only system, ignoring...', file=stderr)
+    model.compile(mode="reduce-overhead")
 
     # function signature: (expanded tuple of input batches) -> tuple of output batches
     def search(query: list[str]) -> tuple[list[str]]:
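
For context on what the one-line change does: `nn.Module.compile()` (available in recent PyTorch) compiles the module's `forward` in place via `torch.compile`, and `mode="reduce-overhead"` targets small-batch latency by using CUDA graphs where possible to cut per-call Python and kernel-launch overhead. Below is a minimal, self-contained sketch, not from the repo: the toy model and timing harness are illustrative stand-ins for `get_model()`, showing the one-time compilation cost on the first call and the faster compiled path afterwards.

```python
# Illustrative sketch only: the toy model stands in for the app's get_model().
import time

import torch
from torch import nn

model = nn.Sequential(nn.Linear(384, 384), nn.ReLU(), nn.Linear(384, 384))
if torch.cuda.is_available():
    model = model.half().cuda()

# In-place equivalent of `model = torch.compile(model)`; "reduce-overhead"
# trades extra compile time for lower per-call latency on small batches.
model.compile(mode="reduce-overhead")

x = torch.randn(1, 384).to(next(model.parameters()).device,
                           next(model.parameters()).dtype)

def timed_call() -> float:
    # Synchronize around the call so GPU time is actually measured.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    with torch.no_grad():
        model(x)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return time.perf_counter() - t0

print(f"first call (compiles):  {timed_call():.4f} s")
print(f"second call (compiled): {timed_call():.4f} s")
```

One caveat worth knowing: the first query after startup absorbs the compilation cost, so latency improves only from the second request onward, and a recompilation may be triggered if input shapes change.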