# Spaces: Runtime error
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_community.document_loaders import UnstructuredFileLoader, DirectoryLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import GPT4AllEmbeddings
# Configuration
pdf_data_path = "/data"  # directory scanned for source documents
vector_db_path = "vectorstores/db_faiss"  # where the FAISS index is saved
# Function 1: build a vector DB from a single passage of text.
def create_db_from_text(raw_text=None):
    """Embed a passage of text into a FAISS index and persist it.

    The text is split on newlines (``chunk_size=100``, ``chunk_overlap=20``),
    each chunk is embedded with the GPT4All MiniLM model, and the resulting
    index is saved under ``vector_db_path``.

    Args:
        raw_text: Text to index. When ``None`` (the default) the built-in
            sample sentence is indexed, which is what this function did
            before it accepted an argument.

    Returns:
        The populated FAISS vector store.

    Raises:
        ValueError: If ``raw_text`` is empty or contains only whitespace.
    """
    if raw_text is None:
        # Built-in sample passage used when the caller supplies no text.
        raw_text = "\nThinh created you who is a chatbox at Resvu,\n"

    # Fail fast: blank text yields no chunks, and an empty chunk list
    # cannot be turned into an index.
    if not raw_text.strip():
        raise ValueError("raw_text must contain some non-whitespace text")

    # Split the text into small chunks.
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=100,
        chunk_overlap=20,
        length_function=len,
    )
    chunks = text_splitter.split_text(raw_text)

    # Embedding model.
    embedding_model = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf")

    # Load the chunks into a FAISS vector DB and persist it.
    db = FAISS.from_texts(texts=chunks, embedding=embedding_model)
    db.save_local(vector_db_path)
    return db
# Glob patterns for the file types to load from the data directory.
file_types = ["*.pdf", "*.txt", "*.doc", "*.docx"]
def create_db_from_files():
    """Build a FAISS index from every supported file in ``pdf_data_path``.

    One ``DirectoryLoader`` is created per pattern in ``file_types``. The
    loaded documents are split with ``chunk_size=200`` and
    ``chunk_overlap=50``, embedded with the GPT4All MiniLM model, and the
    index is saved under ``vector_db_path``.

    Returns:
        The populated FAISS vector store.

    Raises:
        ValueError: If no matching files are found, or the loaded files
            yield no text chunks to index.
    """
    # One loader per pattern: each scans the data directory for a single
    # file type and parses the matches with UnstructuredFileLoader.
    loaders = [
        DirectoryLoader(
            pdf_data_path,
            glob=pattern,
            loader_cls=UnstructuredFileLoader,
        )
        for pattern in file_types
    ]

    # Load all documents.
    documents = []
    for loader in loaders:
        documents.extend(loader.load())
    if not documents:
        raise ValueError(
            f"No files matching {file_types} were found in {pdf_data_path!r}"
        )

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
    chunks = text_splitter.split_documents(documents)
    if not chunks:
        # An empty chunk list cannot be turned into an index; report it
        # clearly instead of failing inside the vector store.
        raise ValueError(
            f"The files loaded from {pdf_data_path!r} contain no text to index"
        )

    # Embedding model.
    embedding_model = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf")

    db = FAISS.from_documents(chunks, embedding_model)
    db.save_local(vector_db_path)
    return db