Namitg02 committed
Commit 401cca7 · verified · 1 Parent(s): 40d551d

Update app.py

Files changed (1)
  1. app.py +13 -12
app.py CHANGED
@@ -16,34 +16,38 @@ from transformers import AutoModelForCausalLM
 from transformers import TextIteratorStreamer
 from threading import Thread
 
+llm_model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+tokenizer = AutoTokenizer.from_pretrained(llm_model)
+# pulling tokeinzer for text generation model
 
 
-#dataset = load_dataset("Namitg02/Test", split='train', streaming=False)
+dataset = load_dataset("Namitg02/Test", split='train', streaming=False)
 #dataset = load_dataset("not-lain/wikipedia",revision = "embedded")
-dataset = load_dataset("epfl-llm/guidelines", split='train')
+#dataset = load_dataset("epfl-llm/guidelines", split='train')
 
 #Returns a list of dictionaries, each representing a row in the dataset.
 print(dataset[1])
-print(dataset)
-# splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=25,separators=["\n"]) # ["\n\n", "\n", " ", ""])
+dataset.features
+splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=25) # ["\n\n", "\n", " ", ""])
 
 
-#docs = splitter.create_documents(str(dataset))
+docs = splitter.create_documents(str(dataset))
 # Returns a list of documents
-#print(docs)
+print(docs)
 embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
 #all-MiniLM-L6-v2, BAAI/bge-base-en-v1.5,infgrad/stella-base-en-v2, BAAI/bge-large-en-v1.5 working with default dimensions
 #docs_text = [doc.text for doc in docs]
 #embed = embedding_model.embed_documents(docs_text)
+embeddings = embedding_model.encode(docs)
 embedding_dim = embedding_model.get_sentence_embedding_dimension()
 print(embedding_dim)
 #data = FAISS.from_embeddings(embed, embedding_model)
 #data = FAISS.from_texts(docs, embedding_model)
 
 # Returns a FAISS wrapper vector store. Input is a list of strings. from_documents method used documents to Return VectorStore
-
-data = dataset["clean_text"]
-#data = dataset["train"]
+# add_embeddings
+#data = dataset["clean_text"]
+data = dataset["text"]
 
 #print(data)
 d = 384 # vectors dimension
@@ -65,9 +69,6 @@ If you don't know the answer, just say "I do not know." Don't make up an answer.
 print("check2")
 
 
-llm_model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-tokenizer = AutoTokenizer.from_pretrained(llm_model)
-# pulling tokeinzer for text generation model
 model = AutoModelForCausalLM.from_pretrained(llm_model)
 # Initializing the text generation model
 
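The commit encodes the chunks (embeddings = embedding_model.encode(docs)) but leaves every FAISS line commented out, and it hard-codes d = 384 even though mxbai-embed-large-v1 reports a 1024-dimensional embedding. A minimal sketch of indexing and querying those vectors with faiss directly, assuming faiss-cpu is installed; texts is a hypothetical stand-in for the chunk strings (SentenceTransformer.encode expects strings, not the Document objects create_documents returns):

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
texts = ["chunk one", "chunk two"]  # hypothetical stand-in for the split chunks

embeddings = embedding_model.encode(texts)               # shape: (n_chunks, dim)
d = embedding_model.get_sentence_embedding_dimension()   # 1024 for this model

index = faiss.IndexFlatL2(d)                             # exact L2 search
index.add(np.asarray(embeddings, dtype="float32"))       # FAISS expects float32

# Retrieval: embed the question and fetch the k nearest chunks.
query = embedding_model.encode(["example question"])
distances, ids = index.search(np.asarray(query, dtype="float32"), k=2)
print([texts[i] for i in ids[0]])

Deriving d from get_sentence_embedding_dimension() rather than a literal keeps the index in sync when the embedding model is swapped, which the commented-out model list suggests happens often here.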
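The imports the diff keeps at the top of the file (TextIteratorStreamer, Thread) are the standard Transformers pattern for token-by-token streaming. A minimal sketch of that pattern with the commit's TinyLlama checkpoint; the prompt string is a placeholder:

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

llm_model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(llm_model)
model = AutoModelForCausalLM.from_pretrained(llm_model)

streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
inputs = tokenizer("example question", return_tensors="pt")  # placeholder prompt

# generate() blocks until completion, so it runs on a worker thread
# while the main thread consumes decoded text pieces as they arrive.
thread = Thread(target=model.generate,
                kwargs=dict(**inputs, streamer=streamer, max_new_tokens=128))
thread.start()
for text_piece in streamer:
    print(text_piece, end="", flush=True)
thread.join()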