Spaces:
Runtime error
Runtime error
Update app.py
Browse files
Creating embeddings & indexing for docs that we have already loaded
app.py
CHANGED
@@ -1,4 +1,9 @@
|
|
1 |
from datasets import load_dataset
|
|
|
|
|
|
|
|
|
|
|
2 |
# Load a small subset (12,000 rows)
|
3 |
dataset = load_dataset("wiki40b", "en", split="train[:12000]")
|
4 |
|
@@ -6,3 +11,17 @@ dataset = load_dataset("wiki40b", "en", split="train[:12000]")
|
|
6 |
docs = [d["text"] for d in dataset]
|
7 |
|
8 |
print("Loaded dataset with", len(docs), "documents.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from datasets import load_dataset
|
2 |
+
from sentence_transformers import SentenceTransformer
|
3 |
+
import faiss
|
4 |
+
import numpy as np
|
5 |
+
|
6 |
+
|
7 |
# Load a small subset (12,000 rows)
|
8 |
dataset = load_dataset("wiki40b", "en", split="train[:12000]")
|
9 |
|
|
|
11 |
docs = [d["text"] for d in dataset]
|
12 |
|
13 |
print("Loaded dataset with", len(docs), "documents.")
|
14 |
+
|
15 |
+
# Load embedding model
|
16 |
+
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
17 |
+
|
18 |
+
# Convert texts to embeddings
|
19 |
+
embeddings = embed_model.encode(docs, show_progress_bar=True)
|
20 |
+
|
21 |
+
# Store in FAISS index
|
22 |
+
dimension = embeddings.shape[1]
|
23 |
+
index = faiss.IndexFlatL2(dimension)
|
24 |
+
index.add(np.array(embeddings))
|
25 |
+
|
26 |
+
print("Stored embeddings in FAISS!")
|
27 |
+
|