colonelwatch committed
Commit 2db96ca · 1 Parent(s): 67dc9b0

Merge the index on disk to keep file sizes under 4GB

Files changed (2):
  1. .gitignore +1 -0
  2. app.py +14 -3
.gitignore ADDED
@@ -0,0 +1 @@
+temp.ivfdata
app.py CHANGED
@@ -12,6 +12,7 @@ from typing import TypedDict, Self, Any, Callable
 from datasets import Dataset
 from datasets.search import FaissIndex
 import faiss
+from faiss.contrib.ondisk import merge_ondisk
 import gradio as gr
 import requests
 from sentence_transformers import SentenceTransformer
@@ -127,12 +128,22 @@ def get_model(
     )
 
 
+def merge_shards(dir: Path) -> faiss.Index:
+    empty_path = dir / "empty.faiss"
+    shard_paths = [str(p) for p in dir.glob("shard_*.faiss")]
+    merged_ivfdata_path = Path("temp.ivfdata")
+
+    index = faiss.read_index(str(empty_path))
+    merged_ivfdata_path.unlink(missing_ok=True)  # overwrite previous if it exists (TODO: do I need this?)
+    merge_ondisk(index, shard_paths, str(merged_ivfdata_path))
+
+    return index
+
+
 def get_index(dir: Path, search_time_s: float) -> Dataset:
     # NOTE: a private attr is used to get the faiss.IO_FLAG_ONDISK_SAME_DIR flag!
     index: Dataset = Dataset.from_parquet(str(dir / "ids.parquet"))  # type: ignore
-    faiss_index: faiss.Index = faiss.read_index(
-        str(dir / "index.faiss"), faiss.IO_FLAG_ONDISK_SAME_DIR
-    )
+    faiss_index = merge_shards(dir / "shards")
     index._indexes["embedding"] = FaissIndex(None, None, None, faiss_index)
 
     with open(dir / "params.json", "r") as f:
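
For context: faiss's merge_ondisk takes a trained-but-empty IVF index plus a list of populated shard indexes, and writes their combined inverted lists into a single .ivfdata file that the merged index memory-maps at search time, so no single index file has to exceed 4GB. This commit assumes empty.faiss and the shard_*.faiss files already exist under dir / "shards". A minimal sketch of how such files could be produced, following the standard faiss on-disk recipe (the build_shards helper, factory string, and shard count are illustrative assumptions, not part of this commit):

    # Hypothetical shard-building step; not part of this commit.
    from pathlib import Path

    import faiss
    import numpy as np

    def build_shards(embeddings: np.ndarray, dir: Path, n_shards: int = 4) -> None:
        d = embeddings.shape[1]  # embeddings assumed float32, shape (n, d)

        # Train an IVF index, then write it out *empty*: merge_ondisk later
        # repopulates its inverted lists from the shards.
        index = faiss.index_factory(d, "IVF4096,Flat")  # factory string is a guess
        index.train(embeddings)
        faiss.write_index(index, str(dir / "empty.faiss"))

        # Add each slice of the data to a fresh clone of the empty index and
        # write it out, keeping every shard file comfortably under 4GB.
        for i, shard in enumerate(np.array_split(embeddings, n_shards)):
            shard_index = faiss.clone_index(index)
            shard_index.add(shard)
            faiss.write_index(shard_index, str(dir / f"shard_{i}.faiss"))

With shards in that layout, merge_shards above reads empty.faiss, merges the shards into temp.ivfdata (hence the new .gitignore entry), and returns the merged index for get_index to wrap.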