|
""" |
|
Indexing with vector database |
|
""" |
|
|
|
from pathlib import Path |
|
import re |
|
import chromadb |
|
from unidecode import unidecode |
|
|
|
from langchain_community.document_loaders import PyPDFLoader, TextLoader |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain_chroma import Chroma |
|
from langchain_huggingface import HuggingFaceEmbeddings |
|
|
|
|
|
|
|
def load_doc(list_file_path, chunk_size, chunk_overlap):
    """Load the supported documents and split them into chunks.

    Files whose name ends in ``.pdf`` are read with ``PyPDFLoader`` and files
    ending in ``.txt`` with ``TextLoader``. The extension check ignores case,
    so ``REPORT.PDF`` is loaded too. Files with any other extension are
    skipped.

    Args:
        list_file_path: Iterable of file paths (``str`` or ``pathlib.Path``).
        chunk_size: ``chunk_size`` forwarded to
            ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        A ``(doc_splits, full_text)`` tuple. ``doc_splits`` is the result of
        splitting every loaded page; ``full_text`` is the page contents of
        each loaded file joined with newlines, each file followed by one
        trailing newline.
    """
    pages = []
    file_texts = []
    for path in list_file_path:
        # str() lets callers pass pathlib.Path objects, which have no
        # endswith(); lower() makes the extension match case-insensitive.
        path_str = str(path)
        lowered = path_str.lower()
        if lowered.endswith(".pdf"):
            loader = PyPDFLoader(path_str)
        elif lowered.endswith(".txt"):
            loader = TextLoader(path_str)
        else:
            # Unsupported file type: skip it.
            continue
        doc_pages = loader.load()
        pages.extend(doc_pages)
        file_texts.append(
            "\n".join(page.page_content for page in doc_pages) + "\n"
        )

    # Join once instead of growing a string with += inside the loop.
    full_text = "".join(file_texts)

    if not pages:
        # Nothing was loaded, so there is nothing to split.
        return [], full_text

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    doc_splits = text_splitter.split_documents(pages)
    return doc_splits, full_text
|
|
|
|
|
|
|
def create_collection_name(filepath):
    """Derive a vector-database collection name from a file path.

    The file stem is passed through ``unidecode``, every run of characters
    outside ``A-Za-z0-9`` becomes a single hyphen, and the result is cut to
    at most 50 characters. Names shorter than 3 characters get ``"xyz"``
    appended. A non-alphanumeric first character is overwritten with ``"A"``
    and a non-alphanumeric last character with ``"Z"``.

    The file path and the resulting name are printed to stdout.

    Args:
        filepath: Path of the source file; only its stem is used.

    Returns:
        The sanitized collection name.
    """
    stem = Path(filepath).stem.replace(" ", "-")
    ascii_stem = unidecode(stem)

    # Collapse non-alphanumeric runs into one hyphen, then cap the length.
    name = re.sub("[^A-Za-z0-9]+", "-", ascii_stem)[:50]

    # Pad names that are too short.
    if len(name) < 3:
        name += "xyz"

    # Overwrite (rather than strip) a non-alphanumeric first or last
    # character, so the length is unchanged.
    first_char = name[0] if name[0].isalnum() else "A"
    name = first_char + name[1:]
    last_char = name[-1] if name[-1].isalnum() else "Z"
    name = name[:-1] + last_char

    print("\n\nFilepath: ", filepath)
    print("Collection name: ", name)
    return name
|
|
|
|
|
|
|
def create_db(splits, collection_name):
    """Embed the document splits and store them in a Chroma collection.

    Embeddings come from the
    ``sentence-transformers/paraphrase-multilingual-mpnet-base-v2`` model via
    ``HuggingFaceEmbeddings``. The collection is created on a new
    ``chromadb.EphemeralClient``.

    Args:
        splits: Documents to embed, passed as ``documents`` to
            ``Chroma.from_documents``.
        collection_name: Name of the Chroma collection to create.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    )

    # Clear chromadb's shared system cache before creating the client.
    # NOTE(review): presumably so repeated calls do not reuse state from an
    # earlier client -- confirm.
    chromadb.api.client.SharedSystemClient.clear_system_cache()
    ephemeral_client = chromadb.EphemeralClient()

    return Chroma.from_documents(
        documents=splits,
        embedding=embedder,
        client=ephemeral_client,
        collection_name=collection_name,
    )
|
|