File size: 1,117 Bytes
341b0e8 db2d027 f437f2a 30b8a93 f437f2a 6085a4e 30b8a93 f437f2a 1d55d4a 73e234f 51c6493 f437f2a eb40503 f437f2a 30b8a93 f437f2a 229c387 51c6493 f437f2a 341b0e8 f437f2a 969e642 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
from langchain_community.document_loaders import PyPDFLoader
from datasets import load_dataset
# Download the "Namitg02/Test" dataset from the Hugging Face Hub. No `split`
# is requested, so the loader hands back every available split.
dataset = load_dataset(path="Namitg02/Test")
# Print the loaded object's summary to stdout as a quick sanity check.
print(dataset)
from langchain.docstore.document import Document as LangchainDocument
# Wrap every dataset row in a LangChain Document so it can be chunked and
# indexed further down. `dataset` was loaded without `split=`, so it is a
# mapping of split name -> Dataset; rows are gathered from each split in turn
# (iterating the mapping itself would only yield the split names).
# NOTE(review): the "dataset" / "two" column names and the "one" metadata key
# are kept exactly as originally written -- confirm they match the real schema
# of Namitg02/Test, otherwise this raises KeyError on the first row.
RAW_KNOWLEDGE_BASE = [
    LangchainDocument(page_content=doc["dataset"], metadata={"one": doc["two"]})
    for split in dataset.values()
    for doc in split
]
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Chunking configuration: pieces of at most 100 characters with a 15-character
# overlap, trying the separators in order (paragraph break, line break, space,
# then no separator at all).
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_overlap=15,
    chunk_size=100,
)
# Split the Document objects (not the printed dataset summary) so the metadata
# attached to each source document is carried along with its chunks.
docs = splitter.split_documents(RAW_KNOWLEDGE_BASE)
from langchain_community.embeddings import HuggingFaceEmbeddings
# Embedding backend used when the chunks are indexed below. The wrapper is
# handed to the vector store as-is; nothing is embedded at this point.
embedding_model = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)
from langchain_community.vectorstores import Chroma
# On-disk location handed to Chroma through `persist_directory`.
persist_directory = "docs/chroma/"

# Embed the chunks and index them in a Chroma collection.
# `from_documents` expects a flat sequence of Document objects. `docs` is
# already the list returned by `split_documents`, so it is passed unchanged;
# wrapping it in another list made Chroma receive a single `list` element
# where it expected a Document.
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embedding_model,
    persist_directory=persist_directory,
)

# Retriever view over the vector store, created with its default settings.
retriever = vectordb.as_retriever()
import gradio as gr
# Load the hosted zephyr-7b-beta model as a ready-made Gradio interface and
# start serving it. The stray trailing "|" that followed this statement was
# not valid Python and has been removed.
# NOTE(review): in the code visible here, `retriever` is never passed to this
# interface, so the UI answers without using the indexed documents -- confirm
# whether that is intended.
gr.load("models/HuggingFaceH4/zephyr-7b-beta").launch()