colonelwatch committed on
Commit
65eeead
·
1 Parent(s): 561e8f7

Drop batching because ZeroGPU crashes with it enabled

Browse files
Files changed (1) hide show
  1. app.py +15 -26
app.py CHANGED
@@ -275,35 +275,23 @@ def main():
275
  print('warning: used "FP16" on CPU-only system, ignoring...', file=stderr)
276
  model.compile(mode="reduce-overhead")
277
 
278
- def encode(query: list[str]) -> npt.NDArray[np.float16 | np.float32]:
279
- return model.encode(query, prompt_name, normalize_embeddings=normalize)
 
 
 
 
280
  if spaces:
281
  encode = spaces.GPU(encode)
282
 
283
- # function signature: (expanded tuple of input batches) -> tuple of output batches
284
- def search(query: list[str]) -> tuple[list[str]]:
285
  query_embedding = encode(query)
286
- distances, faiss_ids = index.search_batch("embedding", query_embedding, k)
287
-
288
- faiss_ids_flat = list(chain(*faiss_ids))
289
- openalex_ids_flat = index[faiss_ids_flat]["id"]
290
- works_flat = execute_request(openalex_ids_flat, mailto)
291
-
292
- temp: list[Work] = []
293
- works: list[list[Work]] = []
294
- for work in works_flat:
295
- temp.append(work)
296
- if len(temp) == k:
297
- works.append(temp)
298
- temp = []
299
- assert not temp, "request a multiple of k IDs, did not get a multiple back"
300
-
301
- result_strings = [
302
- format_response(w, d, calculate_similarity=normalize)
303
- for w, d in zip(works, distances)
304
- ]
305
 
306
- return (result_strings, )
307
 
308
  with gr.Blocks() as demo:
309
  # figure out the words to describe the quantity
@@ -349,8 +337,9 @@ def main():
349
  container=True,
350
  )
351
 
352
- query.submit(search, inputs=[query], outputs=[results], batch=True)
353
- btn.click(search, inputs=[query], outputs=[results], batch=True)
 
354
 
355
  demo.queue()
356
  demo.launch()
 
275
  print('warning: used "FP16" on CPU-only system, ignoring...', file=stderr)
276
  model.compile(mode="reduce-overhead")
277
 
278
+ # TODO: use something like the encode_faster function from the main repo to minimize
279
+ # alloc'd GPU time
280
+ def encode(query: str) -> npt.NDArray[np.float16 | np.float32]:
281
+ return model.encode(
282
+ query, prompt_name, convert_to_numpy=True, normalize_embeddings=normalize
283
+ )
284
  if spaces:
285
  encode = spaces.GPU(encode)
286
 
287
+ def search(query: str) -> str:
 
288
  query_embedding = encode(query)
289
+ distances, faiss_ids = index.search("embedding", query_embedding, k)
290
+
291
+ openalex_ids = index[faiss_ids]["id"]
292
+ works = execute_request(openalex_ids, mailto)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
 
294
+ return format_response(works, distances, calculate_similarity=normalize)
295
 
296
  with gr.Blocks() as demo:
297
  # figure out the words to describe the quantity
 
337
  container=True,
338
  )
339
 
340
+ # NOTE: ZeroGPU doesn't seem to support batching
341
+ query.submit(search, inputs=[query], outputs=[results])
342
+ btn.click(search, inputs=[query], outputs=[results])
343
 
344
  demo.queue()
345
  demo.launch()