Namitg02 committed on
Commit
ec8bfb4
·
verified ·
1 Parent(s): 4def30d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -6
app.py CHANGED
@@ -32,20 +32,23 @@ dataset.features
32
  #print(Itemdetails)
33
  splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=25) # ["\n\n", "\n", " ", ""])
34
 
 
35
 
36
- docs = splitter.create_documents(str(dataset))
37
  # Returns a list of documents
38
  print(docs)
39
- #embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
40
- embedding_model = HuggingFaceEmbeddings(model_name = "mixedbread-ai/mxbai-embed-large-v1")
41
  #all-MiniLM-L6-v2, BAAI/bge-base-en-v1.5,infgrad/stella-base-en-v2, BAAI/bge-large-en-v1.5 working with default dimensions
42
  #docs_text = [doc.text for doc in docs]
43
  #embed = embedding_model.embed_documents(docs_text)
44
  #embeddings = embedding_model.encode(docs)
 
45
 
46
- doc_func = lambda x: x.text
47
- dataset = list(map(doc_func, dataset))
48
- embeddings = embedding_model.embed_documents(dataset)
 
 
49
  #embeddings = embedding_model.embed_documents(docs)
50
  dataset = dataset.add_column('embeddings', embeddings)
51
  embedding_dim = embedding_model.get_sentence_embedding_dimension()
 
32
  #print(Itemdetails)
33
  splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=25) # ["\n\n", "\n", " ", ""])
34
 
35
+ #docs = splitter.create_documents(str(dataset))
36
 
 
37
  # Returns a list of documents
38
  print(docs)
39
+ embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
40
+ #embedding_model = HuggingFaceEmbeddings(model_name = "mixedbread-ai/mxbai-embed-large-v1")
41
  #all-MiniLM-L6-v2, BAAI/bge-base-en-v1.5,infgrad/stella-base-en-v2, BAAI/bge-large-en-v1.5 working with default dimensions
42
  #docs_text = [doc.text for doc in docs]
43
  #embed = embedding_model.embed_documents(docs_text)
44
  #embeddings = embedding_model.encode(docs)
45
+ embeddings = torch.from_numpy(dataset["train"].to_pandas().to_numpy()).to(torch.float)
46
 
47
+
48
+
49
+ #doc_func = lambda x: x.text
50
+ #dataset = list(map(doc_func, dataset))
51
+ #embeddings = embedding_model.embed_documents(dataset)
52
  #embeddings = embedding_model.embed_documents(docs)
53
  dataset = dataset.add_column('embeddings', embeddings)
54
  embedding_dim = embedding_model.get_sentence_embedding_dimension()