File size: 1,117 Bytes
341b0e8
 
 
 
db2d027
f437f2a
30b8a93
 
 
 
 
 
f437f2a
6085a4e
30b8a93
 
f437f2a
 
1d55d4a
73e234f
51c6493
f437f2a
 
eb40503
f437f2a
30b8a93
f437f2a
229c387
51c6493
f437f2a
 
 
 
 
341b0e8
f437f2a
969e642
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from langchain_community.document_loaders import PyPDFLoader

from datasets import load_dataset

# Fetch the dataset from the Hugging Face Hub. Called without `split=`,
# `load_dataset` hands back a DatasetDict (split name -> Dataset) rather than
# a single table of rows, so the rows have to be reached through its splits.
dataset = load_dataset("Namitg02/Test")
print(dataset)

from langchain.docstore.document import Document as LangchainDocument

# Wrap every row of every split in a LangChain Document so the text splitter
# and vector store further down can consume them.
# The previous list literal referenced `doc` without ever binding it, which
# raised NameError at import time; the comprehension supplies the iteration.
# NOTE(review): the column names "dataset" and "two" are carried over
# unchanged -- confirm they match the dataset's actual schema and that
# doc["dataset"] holds text.
RAW_KNOWLEDGE_BASE = [
    LangchainDocument(page_content=doc["dataset"], metadata={"one": doc["two"]})
    for split in dataset.values()
    for doc in split
]

from langchain.text_splitter import RecursiveCharacterTextSplitter
# Cut each knowledge-base document into chunks of at most 100 units with a
# 15-unit overlap between neighbours. The separators are listed from coarsest
# (blank line) to finest (empty string).
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_overlap=15,
    chunk_size=100,
)
docs = splitter.split_documents(RAW_KNOWLEDGE_BASE)


from langchain_community.embeddings import HuggingFaceEmbeddings

# Embedding model used to turn each chunk into a vector for the store below.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


from langchain_community.vectorstores import Chroma

# Directory handed to Chroma as its persistence location.
persist_directory = 'docs/chroma/'

# `docs` is already a flat list of Documents produced by the splitter.
# Passing `[docs]` (a list containing that list) made Chroma treat the inner
# list as a single Document and fail when reading its `page_content`, so the
# list is passed through as-is.
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embedding_model,
    persist_directory=persist_directory,
)


# Expose the vector store through LangChain's retriever interface.
# NOTE(review): nothing visible in this file reads `retriever`; the Gradio app
# below is loaded straight from the Hub model and does not appear to use the
# retrieved chunks -- confirm whether the two were meant to be wired together.
retriever = vectordb.as_retriever()

import gradio as gr

# Build the UI from the hosted model, then start serving it.
zephyr_ui = gr.load("models/HuggingFaceH4/zephyr-7b-beta")
zephyr_ui.launch()