# ResvuChatbox / process-documents.py
# Author: thinh111 — initial commit (615a1d7)
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_community.document_loaders import UnstructuredFileLoader, DirectoryLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import GPT4AllEmbeddings
# Khai bao bien
pdf_data_path = "/data"
vector_db_path = "vectorstores/db_faiss"
# Function 1: build a vector DB from a single hard-coded text snippet
def create_db_from_text():
    """Split a fixed text snippet, embed it locally, and persist a FAISS index.

    Returns the FAISS vector store after saving it to ``vector_db_path``.
    """
    raw_text = """
Thinh created you who is a chatbox at Resvu,
"""
    # Break the text into small overlapping windows on newline boundaries.
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=100,
        chunk_overlap=20,
        length_function=len,
    )
    pieces = splitter.split_text(raw_text)
    # Local GPT4All embedding model (MiniLM, GGUF format — no API calls).
    embedder = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf")
    # Index the embedded chunks with FAISS and persist to disk.
    store = FAISS.from_texts(texts=pieces, embedding=embedder)
    store.save_local(vector_db_path)
    return store
# Define the file types you want to load
file_types = ["*.pdf", "*.txt", "*.doc", "*.docx"]
def create_db_from_files():
    """Load every supported document under ``pdf_data_path``, chunk it,
    embed the chunks locally, and persist a FAISS index.

    Returns the FAISS vector store after saving it to ``vector_db_path``.
    """
    # One DirectoryLoader per glob pattern; UnstructuredFileLoader handles
    # all of the listed formats (pdf/txt/doc/docx).
    loaders = [
        DirectoryLoader(
            pdf_data_path,
            glob=pattern,
            loader_cls=UnstructuredFileLoader,
        )
        for pattern in file_types
    ]
    # Flatten the per-loader results into a single document list.
    documents = []
    for loader in loaders:
        documents.extend(loader.load())
    # Recursive splitter keeps chunks near 200 chars with 50-char overlap.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
    chunks = text_splitter.split_documents(documents)
    # Same local GPT4All embedding model as create_db_from_text.
    embedding_model = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf")
    db = FAISS.from_documents(chunks, embedding_model)
    db.save_local(vector_db_path)
    return db