Update app.py
Browse files
app.py
CHANGED
@@ -28,9 +28,10 @@ dataset = load_dataset("Namitg02/Test", split='train', streaming=False)
|
|
28 |
#Returns a list of dictionaries, each representing a row in the dataset.
|
29 |
print(dataset[1])
|
30 |
dataset.features
|
|
|
31 |
#Itemdetails = dataset.items()
|
32 |
#print(Itemdetails)
|
33 |
-
splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=25) # ["\n\n", "\n", " ", ""])
|
34 |
|
35 |
#docs = splitter.create_documents(str(dataset))
|
36 |
|
@@ -46,8 +47,16 @@ embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
|
|
46 |
|
47 |
#doc_func = lambda x: x.text
|
48 |
#dataset = list(map(doc_func, dataset))
|
49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
print(embeddings)
|
|
|
51 |
#def embedder(dataset[i]):
|
52 |
# return embedding_model.encode(dataset[i])
|
53 |
|
@@ -57,7 +66,7 @@ print(dataset[2])
|
|
57 |
#embeddings = embedding_model.encode(dataset)
|
58 |
|
59 |
#embeddings = embedding_model.embed_documents(docs)
|
60 |
-
|
61 |
embedding_dim = embedding_model.get_sentence_embedding_dimension()
|
62 |
print(dataset[1])
|
63 |
#data = FAISS.from_embeddings(embed, embedding_model)
|
|
|
28 |
#Returns a list of dictionaries, each representing a row in the dataset.
|
29 |
print(dataset[1])
|
30 |
dataset.features
|
31 |
+
length = len(dataset)
|
32 |
#Itemdetails = dataset.items()
|
33 |
#print(Itemdetails)
|
34 |
+
#splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=25) # ["\n\n", "\n", " ", ""])
|
35 |
|
36 |
#docs = splitter.create_documents(str(dataset))
|
37 |
|
|
|
47 |
|
48 |
#doc_func = lambda x: x.text
|
49 |
#dataset = list(map(doc_func, dataset))
|
50 |
+
|
51 |
+
def embedder(dataset):
    """Compute the embedding for the "text" field of a dataset row.

    Intended to be passed to ``Dataset.map``. With the default
    ``batched=False`` the argument is a single row (a mapping of column
    name to value), not the whole dataset, so the row must not be indexed
    with ``[0]`` and ``add_column`` is not available on it.

    Args:
        dataset: The row handed over by ``Dataset.map``; it must contain a
            "text" entry. (The parameter keeps its original name so
            existing callers are unaffected.)

    Returns:
        A dict with a single "embeddings" key holding the output of
        ``embedding_model.encode`` for the row's text. ``Dataset.map``
        merges this dict into the row, which creates the new column.
    """
    # Return only the new column; map() merges it with the existing ones.
    return {"embeddings": embedding_model.encode(dataset["text"])}
|
55 |
+
updated_dataset = dataset.map(embedder)
|
56 |
+
dataset['text'][:length]
|
57 |
+
|
58 |
print(embeddings)
|
59 |
+
|
60 |
#def embedder(dataset[i]):
|
61 |
# return embedding_model.encode(dataset[i])
|
62 |
|
|
|
66 |
#embeddings = embedding_model.encode(dataset)
|
67 |
|
68 |
#embeddings = embedding_model.embed_documents(docs)
|
69 |
+
|
70 |
embedding_dim = embedding_model.get_sentence_embedding_dimension()
|
71 |
print(dataset[1])
|
72 |
#data = FAISS.from_embeddings(embed, embedding_model)
|