Update app.py
app.py CHANGED
@@ -4,10 +4,16 @@ from datasets import load_dataset
 dataset = load_dataset("Namitg02/Test")
 print(dataset)
 
+from langchain.docstore.document import Document as LangchainDocument
+
+RAW_KNOWLEDGE_BASE = [
+    LangchainDocument(page_content=doc["dataset"], metadata={"one": doc["two"]})
+]
+
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=15,separators=["\n\n", "\n", " ", ""])
-
-docs = splitter.split_text(str(dataset))
+docs = splitter.split_documents(RAW_KNOWLEDGE_BASE)
+#docs = splitter.split_text(str(dataset))
 
 
 from langchain_community.embeddings import HuggingFaceEmbeddings
@@ -17,6 +23,7 @@ embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
 
 from langchain_community.vectorstores import Chroma
 persist_directory = 'docs/chroma/'
+
 vectordb = Chroma.from_documents(
     documents=[docs],
     embedding=embedding_model,
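
Note on the added block: as committed, `doc` is not defined at the point where `RAW_KNOWLEDGE_BASE` is built (the list needs to be constructed by iterating over the dataset rows), and `Chroma.from_documents(documents=[docs], ...)` wraps the list returned by `split_documents` inside another list. A minimal sketch of the intended pipeline follows, assuming the Namitg02/Test rows carry their text in a column named "text"; the split name, column name, and metadata key are placeholders, not values taken from the dataset.

# Sketch only: the "train" split, the "text" column, and the metadata key are assumptions.
from datasets import load_dataset
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

dataset = load_dataset("Namitg02/Test", split="train")

# Wrap each dataset row in a LangchainDocument so split_documents() receives Document objects.
raw_knowledge_base = [
    LangchainDocument(page_content=row["text"], metadata={"source": "Namitg02/Test"})
    for row in dataset
]

splitter = RecursiveCharacterTextSplitter(
    chunk_size=100, chunk_overlap=15, separators=["\n\n", "\n", " ", ""]
)
docs = splitter.split_documents(raw_knowledge_base)  # already a list of Documents

embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Pass the list itself; [docs] would hand Chroma a list containing one list.
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embedding_model,
    persist_directory="docs/chroma/",
)

With persist_directory set, Chroma writes the index under docs/chroma/ so it can be reloaded on later runs instead of being rebuilt from scratch.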