Namitg02 committed on
Commit
4ccccb3
·
verified ·
1 Parent(s): 5740cb1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -45
app.py CHANGED
@@ -1,22 +1,11 @@
1
- from getpass import getpass
2
- ACCESS_TOKEN = getpass(token = "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")
3
- base_url = "https://github.com/Namitg02/Diabeteschatbot"
4
-
5
  from datasets import load_dataset
6
- dataset = load_dataset("text",prompt= base_url, stream=None)
7
- print(dataset[1])
8
-
9
  from langchain.docstore.document import Document as LangchainDocument
10
  from langchain.text_splitter import RecursiveCharacterTextSplitter
11
  from sentence_transformers import SentenceTransformer
12
  from langchain_community.embeddings import HuggingFaceEmbeddings
13
- #from langchain_community.vectorstores import faiss
14
  import faiss
15
  from langchain.prompts import PromptTemplate
16
- #from langchain.chains import ConversationalRetrievalChain
17
- #from transformers import pipeline
18
- #from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
19
- #from langchain_core.messages import SystemMessage
20
  import time
21
  from transformers import AutoTokenizer
22
  from transformers import AutoModelForCausalLM
@@ -28,62 +17,45 @@ tokenizer = AutoTokenizer.from_pretrained(llm_model)
28
  # pulling tokenizer for text generation model
29
 
30
 
31
- datasetiter = load_dataset("Namitg02/Test", split='train', streaming=False)
32
  dataset = list(datasetiter)
33
  #dataset = load_dataset("not-lain/wikipedia",revision = "embedded")
34
- #dataset = load_dataset("epfl-llm/guidelines", split='train')
35
-
36
  #Returns a list of dictionaries, each representing a row in the dataset.
37
  print(dataset[1])
38
- #dataset.features
39
  length = len(dataset)
 
40
  #Itemdetails = dataset.items()
41
  #print(Itemdetails)
42
- #splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=25) # ["\n\n", "\n", " ", ""])
43
 
44
- #docs = splitter.create_documents(str(dataset))
45
-
46
- # Returns a list of documents
47
- #print(docs)
48
  embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
49
  #embedding_model = HuggingFaceEmbeddings(model_name = "mixedbread-ai/mxbai-embed-large-v1")
50
  #all-MiniLM-L6-v2, BAAI/bge-base-en-v1.5,infgrad/stella-base-en-v2, BAAI/bge-large-en-v1.5 working with default dimensions
51
- #docs_text = [doc.text for doc in docs]
52
- #embed = embedding_model.embed_documents(docs_text)
53
-
54
- #embeddings = embedding_model.encode(docs)
55
 
56
  #doc_func = lambda x: x.text
57
  #dataset = list(map(doc_func, dataset))
58
 
59
- def embedder(dataset):
60
- embeddings = embedding_model.encode(dataset["text"])
61
- dataset = dataset.add_column('embeddings', embeddings)
62
- return dataset
63
- updated_dataset = dataset.map(embedder)
64
- dataset['text'][:length]
65
-
66
- print(embeddings)
67
 
68
- #def embedder(dataset[i]):
69
- # return embedding_model.encode(dataset[i])
70
-
71
- #dataset = dataset.map(embedder, batched=True)
72
- print(dataset[1])
73
- print(dataset[2])
74
- #embeddings = embedding_model.encode(dataset)
75
 
76
- #embeddings = embedding_model.embed_documents(docs)
 
 
77
 
78
- embedding_dim = embedding_model.get_sentence_embedding_dimension()
79
- print(dataset[1])
80
  #data = FAISS.from_embeddings(embed, embedding_model)
81
  #data = FAISS.from_texts(docs, embedding_model)
82
 
83
  # Returns a FAISS wrapper vector store. Input is a list of strings. The from_documents method takes documents instead and returns a VectorStore
84
  # add_embeddings
85
- #data = dataset["clean_text"]
86
- data = dataset["text"]
87
 
88
  #print(data)
89
  d = 384 # vectors dimension
 
 
 
 
 
1
  from datasets import load_dataset
 
 
 
2
  from langchain.docstore.document import Document as LangchainDocument
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
  from sentence_transformers import SentenceTransformer
5
  from langchain_community.embeddings import HuggingFaceEmbeddings
 
6
  import faiss
7
  from langchain.prompts import PromptTemplate
8
+
 
 
 
9
  import time
10
  from transformers import AutoTokenizer
11
  from transformers import AutoModelForCausalLM
 
17
  # pulling tokenizer for text generation model
18
 
19
 
20
+ #datasetiter = load_dataset("Namitg02/Test", split='train', streaming=False)
21
  dataset = list(datasetiter)
22
  #dataset = load_dataset("not-lain/wikipedia",revision = "embedded")
23
+ dataset = load_dataset("epfl-llm/guidelines", split='train')
 
24
  #Returns a list of dictionaries, each representing a row in the dataset.
25
  print(dataset[1])
 
26
  length = len(dataset)
27
+
28
  #Itemdetails = dataset.items()
29
  #print(Itemdetails)
 
30
 
 
 
 
 
31
  embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
32
  #embedding_model = HuggingFaceEmbeddings(model_name = "mixedbread-ai/mxbai-embed-large-v1")
33
  #all-MiniLM-L6-v2, BAAI/bge-base-en-v1.5,infgrad/stella-base-en-v2, BAAI/bge-large-en-v1.5 working with default dimensions
 
 
 
 
34
 
35
  #doc_func = lambda x: x.text
36
  #dataset = list(map(doc_func, dataset))
37
 
38
+ #def embedder(dataset):
39
+ # embeddings = embedding_model.encode(dataset["text"])
40
+ # dataset = dataset.add_column('embeddings', embeddings)
41
+ # return dataset
42
+ #updated_dataset = dataset.map(embedder)
43
+ #dataset['text'][:length]
 
 
44
 
45
+ #print(embeddings)
 
 
 
 
 
 
46
 
47
+ #print(updated_dataset[1])
48
+ #print(updated_dataset[2])
49
+ #print(dataset[1])
50
 
51
+ #embedding_dim = embedding_model.get_sentence_embedding_dimension()
 
52
  #data = FAISS.from_embeddings(embed, embedding_model)
53
  #data = FAISS.from_texts(docs, embedding_model)
54
 
55
  # Returns a FAISS wrapper vector store. Input is a list of strings. The from_documents method takes documents instead and returns a VectorStore
56
  # add_embeddings
57
+ data = dataset["clean_text"]
58
+ #data = updated_dataset["text"]
59
 
60
  #print(data)
61
  d = 384 # vectors dimension