Kalyani8 committed
Commit 78f2fc1 · verified · 1 Parent(s): 4d9921f

Update app.py

Creating embeddings & indexing for docs that we have already loaded

Files changed (1)
app.py +19 -0
app.py CHANGED
@@ -1,4 +1,9 @@
 from datasets import load_dataset
+from sentence_transformers import SentenceTransformer
+import faiss
+import numpy as np
+
+
 # Load a small subset (12,000 rows)
 dataset = load_dataset("wiki40b", "en", split="train[:12000]")
 
@@ -6,3 +11,17 @@ dataset = load_dataset("wiki40b", "en", split="train[:12000]")
 docs = [d["text"] for d in dataset]
 
 print("Loaded dataset with", len(docs), "documents.")
+
+# Load embedding model
+embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+
+# Convert texts to embeddings
+embeddings = embed_model.encode(docs, show_progress_bar=True)
+
+# Store in FAISS index
+dimension = embeddings.shape[1]
+index = faiss.IndexFlatL2(dimension)
+index.add(np.array(embeddings))
+
+print("Stored embeddings in FAISS!")
+
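For context, here is a minimal sketch of how the index built by this commit could be queried afterwards. Nothing below is part of the commit: the query string, the k value, and the result formatting are illustrative, and the snippet assumes the embed_model, index, docs, and np objects defined in app.py are in scope.

# Hypothetical follow-up, not part of this commit: search the FAISS index.
# Assumes embed_model, index, docs, and np from app.py are in scope.
query = "history of the printing press"   # illustrative query string
query_vec = embed_model.encode([query])   # shape (1, 384) for all-MiniLM-L6-v2
distances, ids = index.search(np.asarray(query_vec, dtype="float32"), k=5)
for rank, (dist, doc_id) in enumerate(zip(distances[0], ids[0]), start=1):
    # IndexFlatL2 returns raw L2 distances, so smaller means more similar
    print(f"{rank}. L2 distance {dist:.3f}: {docs[doc_id][:80]!r}")

Because IndexFlatL2 ranks by raw L2 distance, smaller scores mean closer matches; normalizing the embeddings and using IndexFlatIP instead would give cosine-similarity ranking.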