Namitg02 commited on
Commit
3b72fd6
·
verified ·
1 Parent(s): 91864a8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -15
app.py CHANGED
@@ -16,11 +16,13 @@ llm_model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
16
  tokenizer = AutoTokenizer.from_pretrained(llm_model)
17
  # pulling tokenizer for text generation model
18
 
 
 
 
 
19
 
20
- #datasetiter = load_dataset("Namitg02/Test", split='train', streaming=False)
21
- #dataset = list(datasetiter)
22
  #dataset = load_dataset("not-lain/wikipedia",revision = "embedded")
23
- dataset = load_dataset("epfl-llm/guidelines", split='train')
24
  #Returns a list of dictionaries, each representing a row in the dataset.
25
  print(dataset[1])
26
  length = len(dataset)
@@ -35,27 +37,27 @@ embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
35
  #doc_func = lambda x: x.text
36
  #dataset = list(map(doc_func, dataset))
37
 
38
- #def embedder(dataset):
39
- # embeddings = embedding_model.encode(dataset["text"])
40
- # dataset = dataset.add_column('embeddings', embeddings)
41
- # return dataset
42
- #updated_dataset = dataset.map(embedder)
43
- #dataset['text'][:length]
44
 
45
  #print(embeddings)
46
 
47
- #print(updated_dataset[1])
48
- #print(updated_dataset[2])
49
- #print(dataset[1])
50
 
51
- #embedding_dim = embedding_model.get_sentence_embedding_dimension()
52
  #data = FAISS.from_embeddings(embed, embedding_model)
53
  #data = FAISS.from_texts(docs, embedding_model)
54
 
55
  # Returns a FAISS wrapper vector store. Input is a list of strings. from_documents method used documents to Return VectorStore
56
  # add_embeddings
57
- data = dataset["clean_text"]
58
- #data = updated_dataset["text"]
59
 
60
  #print(data)
61
  d = 384 # vectors dimension
 
16
  tokenizer = AutoTokenizer.from_pretrained(llm_model)
17
  # pulling tokenizer for text generation model
18
 
19
+ import numpy as np
20
+
21
+ datasetiter = load_dataset("Namitg02/Test", split='train', streaming=False)
22
+ dataset = np.array(list(datasetiter))
23
 
 
 
24
  #dataset = load_dataset("not-lain/wikipedia",revision = "embedded")
25
+ #dataset = load_dataset("epfl-llm/guidelines", split='train')
26
  #Returns a list of dictionaries, each representing a row in the dataset.
27
  print(dataset[1])
28
  length = len(dataset)
 
37
  #doc_func = lambda x: x.text
38
  #dataset = list(map(doc_func, dataset))
39
 
40
+ def embedder(dataset):
41
+ embeddings = embedding_model.encode(dataset["text"])
42
+ dataset = dataset.add_column('embeddings', embeddings)
43
+ return dataset
44
+ updated_dataset = dataset.map(embedder)
45
+ dataset['text'][:length]
46
 
47
  #print(embeddings)
48
 
49
+ print(updated_dataset[1])
50
+ print(updated_dataset[2])
51
+ print(dataset[1])
52
 
53
+ embedding_dim = embedding_model.get_sentence_embedding_dimension()
54
  #data = FAISS.from_embeddings(embed, embedding_model)
55
  #data = FAISS.from_texts(docs, embedding_model)
56
 
57
  # Returns a FAISS wrapper vector store. Input is a list of strings. from_documents method used documents to Return VectorStore
58
  # add_embeddings
59
+ #data = dataset["clean_text"]
60
+ data = updated_dataset["text"]
61
 
62
  #print(data)
63
  d = 384 # vectors dimension