Namitg02 committed on
Commit
73aea80
·
verified ·
1 Parent(s): e59f788

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -28
app.py CHANGED
@@ -21,49 +21,30 @@ tokenizer = AutoTokenizer.from_pretrained(llm_model)
21
  # pulling tokenizer for text generation model
22
 
23
  dataset = load_dataset("Namitg02/Test", split='train', streaming=False)
24
- #dataset = load_dataset("not-lain/wikipedia",revision = "embedded")
25
  #dataset = load_dataset("epfl-llm/guidelines", split='train')
26
  #Returns a list of dictionaries, each representing a row in the dataset.
27
- #print(dataset[1])
28
  length = len(dataset)
29
 
30
- #Itemdetails = dataset.items()
31
- #print(Itemdetails)
32
-
33
  embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
34
- #embedding_model = HuggingFaceEmbeddings(model_name = "mixedbread-ai/mxbai-embed-large-v1")
35
  #all-MiniLM-L6-v2, BAAI/bge-base-en-v1.5,infgrad/stella-base-en-v2, BAAI/bge-large-en-v1.5 working with default dimensions
36
 
37
  df = pd.DataFrame(dataset)
38
- print(df.iloc[[1]])
39
-
40
  df['embeddings'] = df['text'].apply(lambda x: embedding_model.encode(x))
 
 
 
41
  print(df.iloc[[1]])
42
  dataset = Dataset.from_pandas(df)
43
- print(dataset[1])
44
- print(dataset[2])
45
 
46
- #doc_func = lambda x: x.text
47
- #dataset = list(map(doc_func, dataset))
48
-
49
- #def embedder(dataset):
50
- # embeddings = embedding_model.encode(dataset["text"])
51
- # dataset = dataset.add_column('embeddings', embeddings)
52
- # return dataset
53
- #updated_dataset = dataset.map(embedder)
54
  #dataset['text'][:length]
55
 
56
- #print(embeddings)
57
-
58
- print(dataset[1])
59
 
60
  embedding_dim = embedding_model.get_sentence_embedding_dimension()
61
- #data = FAISS.from_embeddings(embed, embedding_model)
62
- #data = FAISS.from_texts(docs, embedding_model)
63
-
64
- # Returns a FAISS wrapper vector store. Input is a list of strings. from_documents method used documents to Return VectorStore
65
- # add_embeddings
66
- #data = dataset["clean_text"]
67
  data = dataset
68
 
69
  #print(data)
@@ -75,7 +56,7 @@ m = 32 # hnsw parameter. Higher is more accurate but takes more time to index (
75
  data.add_faiss_index("embeddings")
76
  # builds a FAISS index over the "embeddings" column
77
 
78
- print("check1")
79
  #question = "How can I reverse Diabetes?"
80
 
81
  SYS_PROMPT = """You are an assistant for answering questions.
 
21
  # pulling tokenizer for text generation model
22
 
23
  dataset = load_dataset("Namitg02/Test", split='train', streaming=False)
 
24
  #dataset = load_dataset("epfl-llm/guidelines", split='train')
25
  #Returns a list of dictionaries, each representing a row in the dataset.
 
26
  length = len(dataset)
27
 
 
 
 
28
  embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
 
29
  #all-MiniLM-L6-v2, BAAI/bge-base-en-v1.5,infgrad/stella-base-en-v2, BAAI/bge-large-en-v1.5 working with default dimensions
30
 
31
  df = pd.DataFrame(dataset)
32
+ #print(df.iloc[[1]])
33
+ print(check1)
34
  df['embeddings'] = df['text'].apply(lambda x: embedding_model.encode(x))
35
+ # add_embeddings as a new column
36
+
37
+ print(check1a)
38
  print(df.iloc[[1]])
39
  dataset = Dataset.from_pandas(df)
40
+ print(check1b)
 
41
 
 
 
 
 
 
 
 
 
42
  #dataset['text'][:length]
43
 
44
+ print(dataset[1c])
 
 
45
 
46
  embedding_dim = embedding_model.get_sentence_embedding_dimension()
47
+ # Returns the dimension of the embeddings
 
 
 
 
 
48
  data = dataset
49
 
50
  #print(data)
 
56
  data.add_faiss_index("embeddings")
57
  # builds a FAISS index over the "embeddings" column
58
 
59
+ print("check1d")
60
  #question = "How can I reverse Diabetes?"
61
 
62
  SYS_PROMPT = """You are an assistant for answering questions.