Update app.py
Browse files
app.py
CHANGED
@@ -1,22 +1,11 @@
|
|
1 |
-
from getpass import getpass
|
2 |
-
ACCESS_TOKEN = getpass(token = "<REDACTED_GITHUB_TOKEN>")
|
3 |
-
base_url = "https://github.com/Namitg02/Diabeteschatbot"
|
4 |
-
|
5 |
from datasets import load_dataset
|
6 |
-
dataset = load_dataset("text",prompt= base_url, stream=None)
|
7 |
-
print(dataset[1])
|
8 |
-
|
9 |
from langchain.docstore.document import Document as LangchainDocument
|
10 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
11 |
from sentence_transformers import SentenceTransformer
|
12 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
13 |
-
#from langchain_community.vectorstores import faiss
|
14 |
import faiss
|
15 |
from langchain.prompts import PromptTemplate
|
16 |
-
|
17 |
-
#from transformers import pipeline
|
18 |
-
#from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
|
19 |
-
#from langchain_core.messages import SystemMessage
|
20 |
import time
|
21 |
from transformers import AutoTokenizer
|
22 |
from transformers import AutoModelForCausalLM
|
@@ -28,62 +17,45 @@ tokenizer = AutoTokenizer.from_pretrained(llm_model)
|
|
28 |
# pulling tokenizer for text generation model
|
29 |
|
30 |
|
31 |
-
datasetiter = load_dataset("Namitg02/Test", split='train', streaming=False)
|
32 |
dataset = list(datasetiter)
|
33 |
#dataset = load_dataset("not-lain/wikipedia",revision = "embedded")
|
34 |
-
|
35 |
-
|
36 |
#Returns a list of dictionaries, each representing a row in the dataset.
|
37 |
print(dataset[1])
|
38 |
-
#dataset.features
|
39 |
length = len(dataset)
|
|
|
40 |
#Itemdetails = dataset.items()
|
41 |
#print(Itemdetails)
|
42 |
-
#splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=25) # ["\n\n", "\n", " ", ""])
|
43 |
|
44 |
-
#docs = splitter.create_documents(str(dataset))
|
45 |
-
|
46 |
-
# Returns a list of documents
|
47 |
-
#print(docs)
|
48 |
embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
|
49 |
#embedding_model = HuggingFaceEmbeddings(model_name = "mixedbread-ai/mxbai-embed-large-v1")
|
50 |
# all-MiniLM-L6-v2, BAAI/bge-base-en-v1.5, infgrad/stella-base-en-v2, BAAI/bge-large-en-v1.5 work with default dimensions
|
51 |
-
#docs_text = [doc.text for doc in docs]
|
52 |
-
#embed = embedding_model.embed_documents(docs_text)
|
53 |
-
|
54 |
-
#embeddings = embedding_model.encode(docs)
|
55 |
|
56 |
#doc_func = lambda x: x.text
|
57 |
#dataset = list(map(doc_func, dataset))
|
58 |
|
59 |
-
def embedder(dataset):
|
60 |
-
embeddings = embedding_model.encode(dataset["text"])
|
61 |
-
dataset = dataset.add_column('embeddings', embeddings)
|
62 |
-
return dataset
|
63 |
-
updated_dataset = dataset.map(embedder)
|
64 |
-
dataset['text'][:length]
|
65 |
-
|
66 |
-
print(embeddings)
|
67 |
|
68 |
-
#
|
69 |
-
# return embedding_model.encode(dataset[i])
|
70 |
-
|
71 |
-
#dataset = dataset.map(embedder, batched=True)
|
72 |
-
print(dataset[1])
|
73 |
-
print(dataset[2])
|
74 |
-
#embeddings = embedding_model.encode(dataset)
|
75 |
|
76 |
-
#
|
|
|
|
|
77 |
|
78 |
-
embedding_dim = embedding_model.get_sentence_embedding_dimension()
|
79 |
-
print(dataset[1])
|
80 |
#data = FAISS.from_embeddings(embed, embedding_model)
|
81 |
#data = FAISS.from_texts(docs, embedding_model)
|
82 |
|
83 |
# Returns a FAISS wrapper vector store. Input is a list of strings; the from_documents method uses documents to return a VectorStore.
|
84 |
# add_embeddings
|
85 |
-
|
86 |
-
data =
|
87 |
|
88 |
#print(data)
|
89 |
d = 384 # vectors dimension
|
|
|
|
|
|
|
|
|
|
|
1 |
from datasets import load_dataset
|
|
|
|
|
|
|
2 |
from langchain.docstore.document import Document as LangchainDocument
|
3 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
4 |
from sentence_transformers import SentenceTransformer
|
5 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
|
|
6 |
import faiss
|
7 |
from langchain.prompts import PromptTemplate
|
8 |
+
|
|
|
|
|
|
|
9 |
import time
|
10 |
from transformers import AutoTokenizer
|
11 |
from transformers import AutoModelForCausalLM
|
|
|
17 |
# pulling tokenizer for text generation model
|
18 |
|
19 |
|
20 |
+
#datasetiter = load_dataset("Namitg02/Test", split='train', streaming=False)
|
21 |
dataset = list(datasetiter)
|
22 |
#dataset = load_dataset("not-lain/wikipedia",revision = "embedded")
|
23 |
+
dataset = load_dataset("epfl-llm/guidelines", split='train')
|
|
|
24 |
#Returns a list of dictionaries, each representing a row in the dataset.
|
25 |
print(dataset[1])
|
|
|
26 |
length = len(dataset)
|
27 |
+
|
28 |
#Itemdetails = dataset.items()
|
29 |
#print(Itemdetails)
|
|
|
30 |
|
|
|
|
|
|
|
|
|
31 |
embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
|
32 |
#embedding_model = HuggingFaceEmbeddings(model_name = "mixedbread-ai/mxbai-embed-large-v1")
|
33 |
# all-MiniLM-L6-v2, BAAI/bge-base-en-v1.5, infgrad/stella-base-en-v2, BAAI/bge-large-en-v1.5 work with default dimensions
|
|
|
|
|
|
|
|
|
34 |
|
35 |
#doc_func = lambda x: x.text
|
36 |
#dataset = list(map(doc_func, dataset))
|
37 |
|
38 |
+
#def embedder(dataset):
|
39 |
+
# embeddings = embedding_model.encode(dataset["text"])
|
40 |
+
# dataset = dataset.add_column('embeddings', embeddings)
|
41 |
+
# return dataset
|
42 |
+
#updated_dataset = dataset.map(embedder)
|
43 |
+
#dataset['text'][:length]
|
|
|
|
|
44 |
|
45 |
+
#print(embeddings)
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
+
#print(updated_dataset[1])
|
48 |
+
#print(updated_dataset[2])
|
49 |
+
#print(dataset[1])
|
50 |
|
51 |
+
#embedding_dim = embedding_model.get_sentence_embedding_dimension()
|
|
|
52 |
#data = FAISS.from_embeddings(embed, embedding_model)
|
53 |
#data = FAISS.from_texts(docs, embedding_model)
|
54 |
|
55 |
# Returns a FAISS wrapper vector store. Input is a list of strings; the from_documents method uses documents to return a VectorStore.
|
56 |
# add_embeddings
|
57 |
+
data = dataset["clean_text"]
|
58 |
+
#data = updated_dataset["text"]
|
59 |
|
60 |
#print(data)
|
61 |
d = 384 # vectors dimension
|