Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
c574006
1
Parent(s):
fd084a4
Compile the model for lower latency
Browse files
app.py
CHANGED
@@ -219,11 +219,13 @@ def main():
|
|
219 |
|
220 |
model = get_model(model_name, trust_remote_code)
|
221 |
index = get_index(dir, search_time_s)
|
|
|
222 |
if torch.cuda.is_available():
|
223 |
model = model.half().cuda() if fp16 else model.bfloat16().cuda()
|
224 |
# TODO: if huggingface datasets exposes an fp16 gpu option, use it here
|
225 |
elif fp16:
|
226 |
print('warning: used "FP16" on CPU-only system, ignoring...', file=stderr)
|
|
|
227 |
|
228 |
# function signature: (expanded tuple of input batches) -> tuple of output batches
|
229 |
def search(query: list[str]) -> tuple[list[str]]:
|
|
|
219 |
|
220 |
model = get_model(model_name, trust_remote_code)
|
221 |
index = get_index(dir, search_time_s)
|
222 |
+
|
223 |
if torch.cuda.is_available():
|
224 |
model = model.half().cuda() if fp16 else model.bfloat16().cuda()
|
225 |
# TODO: if huggingface datasets exposes an fp16 gpu option, use it here
|
226 |
elif fp16:
|
227 |
print('warning: used "FP16" on CPU-only system, ignoring...', file=stderr)
|
228 |
+
model.compile(mode="reduce-overhead")
|
229 |
|
230 |
# function signature: (expanded tuple of input batches) -> tuple of output batches
|
231 |
def search(query: list[str]) -> tuple[list[str]]:
|