colonelwatch committed on
Commit 303669c · 1 Parent(s): 65eeead

Minimize ZeroGPU utilization time by cutting out SentenceTransformer overhead

Files changed (1)
app.py +40 -8
app.py CHANGED
@@ -267,6 +267,20 @@ def main():
     normalize, model = get_model(model_name, dir, trust_remote_code)
     index = get_index(dir, search_time_s)
 
+    # follow model.encode logic for acquiring the prompt
+    if prompt_name is None and model.default_prompt_name is not None:
+        prompt_name = model.default_prompt_name
+    if not isinstance(prompt_name, str):
+        raise TypeError("invalid prompt name type")
+    prompt: str | None = model.prompts[prompt_name] if prompt_name is not None else None
+
+    # follow model.encode logic for setting extra_features
+    extra_features: dict[str, Any] = {}
+    if prompt is not None:
+        tokenized = model.tokenize([prompt])
+        if "input_ids" in tokenized:
+            extra_features["prompt_length"] = tokenized["input_ids"].shape[-1] - 1
+
     model.eval()
     if torch.cuda.is_available():
         model = model.half().cuda() if fp16 else model.bfloat16().cuda()
@@ -275,17 +289,35 @@ def main():
         print('warning: used "FP16" on CPU-only system, ignoring...', file=stderr)
     model.compile(mode="reduce-overhead")
 
-    # TODO: use something like the encode_faster function from the main repo to minimize
-    # alloc'd GPU time
-    def encode(query: str) -> npt.NDArray[np.float16 | np.float32]:
-        return model.encode(
-            query, prompt_name, convert_to_numpy=True, normalize_embeddings=normalize
-        )
+    def encode_tokens(features: dict[str, Any]) -> npt.NDArray[np.float32]:
+        # move the tokenized features (a dict of tensors) to the device, non-blocking
+        features = {
+            k: v.to(model.device, non_blocking=True) for k, v in features.items()
+        } | extra_features
+
+        with torch.no_grad():
+            out_features = model.forward(features)
+            embeddings = out_features["sentence_embedding"]
+
+        embeddings = embeddings[0]
+        if model.truncate_dim:
+            embeddings = embeddings[:model.truncate_dim]
+        if normalize:
+            embeddings = torch.nn.functional.normalize(embeddings, dim=0)
+
+        return embeddings.cpu().float().numpy()  # faiss expects a CPU float32 numpy arr
+
     if spaces:
-        encode = spaces.GPU(encode)
+        encode_tokens = spaces.GPU(encode_tokens)
+
+    def encode_string(query: str) -> npt.NDArray[np.float32]:
+        if prompt:
+            query = prompt + query
+        tokens = model.tokenize([query])
+        return encode_tokens(tokens)
 
     def search(query: str) -> str:
-        query_embedding = encode(query)
+        query_embedding = encode_string(query)
         distances, faiss_ids = index.search("embedding", query_embedding, k)
 
         openalex_ids = index[faiss_ids]["id"]
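
Note: ZeroGPU only holds a GPU while a spaces.GPU-wrapped function is running, and model.encode re-tokenizes the query, resolves the prompt, and converts tensors on every call, so wrapping encode kept all of that overhead inside the held-GPU window. This commit hoists prompt resolution to startup, keeps tokenization on the CPU in encode_string, and wraps only the tensor-in/tensor-out encode_tokens. Below is a minimal, self-contained sketch of the same pattern, not the Space's actual code: the checkpoint name is an arbitrary example, and spaces is imported defensively since it exists only on a ZeroGPU Space.

# Minimal sketch of the pattern in this commit, not the Space's actual code.
# Assumptions: sentence-transformers and torch installed; the checkpoint name
# is an arbitrary example; `spaces` exists only on a ZeroGPU Space.
from typing import Any

import numpy as np
import torch
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
model.eval()
if torch.cuda.is_available():
    model = model.cuda()

def encode_tokens(features: dict[str, Any]) -> np.ndarray:
    # Only this function touches the GPU, so only it gets wrapped below.
    features = {k: v.to(model.device, non_blocking=True) for k, v in features.items()}
    with torch.no_grad():
        embedding = model.forward(features)["sentence_embedding"][0]
    return embedding.cpu().float().numpy()  # CPU float32, e.g. for faiss

try:
    import spaces
    encode_tokens = spaces.GPU(encode_tokens)  # GPU held only for this call
except ImportError:
    pass  # running outside a ZeroGPU Space

def encode_string(query: str) -> np.ndarray:
    # Tokenization stays on the CPU, outside the GPU-wrapped call.
    return encode_tokens(model.tokenize([query]))

print(encode_string("attention is all you need").shape)

In the actual commit the wrapped function also merges in the precomputed prompt_length feature and applies truncation and normalization before leaving the GPU, but the effect is the same: tokenization and prompt handling are no longer part of the held-GPU window.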