Namitg02 committed
Commit 63d701e · verified · 1 parent: f1f83d7

Update app.py

Files changed (1):
  app.py (+12, -3)
app.py CHANGED
@@ -28,9 +28,10 @@ dataset = load_dataset("Namitg02/Test", split='train', streaming=False)
 #Returns a list of dictionaries, each representing a row in the dataset.
 print(dataset[1])
 dataset.features
+length = len(dataset)
 #Itemdetails = dataset.items()
 #print(Itemdetails)
-splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=25) # ["\n\n", "\n", " ", ""])
+#splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=25) # ["\n\n", "\n", " ", ""])
 
 #docs = splitter.create_documents(str(dataset))
 
@@ -46,8 +47,16 @@ embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
 
 #doc_func = lambda x: x.text
 #dataset = list(map(doc_func, dataset))
-embeddings = embedding_model.encode(dataset[0]["text"])
+
+def embedder(dataset):
+    embeddings = embedding_model.encode(dataset[0]["text"])
+    dataset = dataset.add_column('embeddings', embeddings)
+    return dataset
+updated_dataset = dataset.map(embedder)
+dataset['text'][:length]
+
 print(embeddings)
+
 #def embedder(dataset[i]):
 #   return embedding_model.encode(dataset[i])
 
@@ -57,7 +66,7 @@ print(dataset[2])
 #embeddings = embedding_model.encode(dataset)
 
 #embeddings = embedding_model.embed_documents(docs)
-dataset = dataset.add_column('embeddings', embeddings)
+
 embedding_dim = embedding_model.get_sentence_embedding_dimension()
 print(dataset[1])
 #data = FAISS.from_embeddings(embed, embedding_model)
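
Note on the new embedder: Dataset.map() passes each row to the mapped function as a plain dict, so dataset[0]["text"] and dataset.add_column(...) inside the function do not operate on the full Dataset as they appear to, and the module-level print(embeddings) no longer has an embeddings variable once the removed line is gone. The sketch below shows the apparent intent (a per-row "embeddings" column plus a FAISS index over it) using the standard datasets pattern; it is not the committed code, and the query string and k value are placeholders.

from datasets import load_dataset
from sentence_transformers import SentenceTransformer

dataset = load_dataset("Namitg02/Test", split="train", streaming=False)
embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

def embedder(example):
    # map() hands over one row (a dict); return the new column value for that row
    return {"embeddings": embedding_model.encode(example["text"])}

dataset = dataset.map(embedder)  # adds an "embeddings" column row by row
embedding_dim = embedding_model.get_sentence_embedding_dimension()

# Index the new column in place (an alternative to FAISS.from_embeddings)
dataset.add_faiss_index(column="embeddings")
scores, samples = dataset.get_nearest_examples(
    "embeddings", embedding_model.encode("example query"), k=3  # placeholder query and k
)
print(embedding_dim, samples["text"])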