Namitg02 committed
Commit d2de7c8 · verified · 1 Parent(s): 2709754

Update app.py

Files changed (1)
  1. app.py +8 -7
app.py CHANGED
@@ -19,11 +19,12 @@ from threading import Thread
 
 
 #dataset = load_dataset("Namitg02/Test", split='train', streaming=False)
-dataset = load_dataset("not-lain/wikipedia",revision = "embedded")
+#dataset = load_dataset("not-lain/wikipedia",revision = "embedded")
+dataset = load_dataset("epfl-llm/guidelines")
 
-# Returns a list of dictionaries, each representing a row in the dataset.
+#Returns a list of dictionaries, each representing a row in the dataset.
 #print(dataset[1])
-#splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=25,separators=["\n\n"]) # ["\n\n", "\n", " ", ""])
+# splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=25,separators=["\n"]) # ["\n\n", "\n", " ", ""])
 
 
 #docs = splitter.create_documents(str(dataset))
@@ -40,16 +41,16 @@ print(embedding_dim)
 
 # Returns a FAISS wrapper vector store. Input is a list of strings. from_documents method used documents to Return VectorStore
 
-#data = dataset["text"]
-data = dataset["train"]
+data = dataset["clean_text"]
+#data = dataset["train"]
 
-print(data)
+#print(data)
 d = 384 # vectors dimension
 m = 32 # hnsw parameter. Higher is more accurate but takes more time to index (default is 32, 128 should be ok)
 #index = faiss.IndexHNSWFlat(d, m)
 #index = faiss.IndexFlatL2(embedding_dim)
 #data.add_faiss_index(embeddings.shape[1], custom_index=index)
-data.add_faiss_index("embeddings")
+data.add_faiss_index("embeddings")
 # adds an index column that for the embeddings
 
 print("check1")
 
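The splitter stays commented out, but the commit narrows its separators from ["\n\n"] to ["\n"]. For reference, a sketch of that chunking step with LangChain's RecursiveCharacterTextSplitter follows; the sample inputs are illustrative, and note that create_documents expects a list of strings rather than str(dataset).

# Sketch of the commented-out chunking step; inputs are illustrative assumptions.
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,      # max characters per chunk
    chunk_overlap=25,    # characters shared between neighbouring chunks
    separators=["\n"],   # the commit narrows the split points to single newlines
)
docs = splitter.create_documents(["First guideline text ...", "Second guideline text ..."])
print(len(docs), docs[0].page_content)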
 