|
"""
Indexing PDF documents with a Chroma vector database.
"""
|
|
|
from pathlib import Path |
|
import re |
|
|
|
import chromadb |
|
|
|
from unidecode import unidecode |
|
|
|
from langchain_community.document_loaders import PyPDFLoader |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain_chroma import Chroma |
|
from langchain_huggingface import HuggingFaceEmbeddings |
|
|
|
|
|
|
|
|
|
def load_doc(list_file_path, chunk_size, chunk_overlap):
    """Read a set of PDF files and cut their pages into chunks.

    Args:
        list_file_path: Iterable of PDF file paths; one ``PyPDFLoader`` is
            built per entry.
        chunk_size: Forwarded as ``chunk_size`` to the
            ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded as ``chunk_overlap`` to the splitter.

    Returns:
        Whatever ``split_documents`` produces for the loaded pages.
    """
    # Build every loader first; no ``load()`` call happens until all exist.
    pdf_loaders = list(map(PyPDFLoader, list_file_path))

    # Collect pages in the same order as the input paths.
    all_pages = [
        page
        for pdf_loader in pdf_loaders
        for page in pdf_loader.load()
    ]

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(all_pages)
|
|
|
|
|
|
|
|
|
|
|
def create_collection_name(filepath):
    """Derive a vector-database collection name from a file path.

    The file's stem is passed through ``unidecode``, every run of characters
    outside ``A-Z``, ``a-z`` and ``0-9`` is collapsed into a single hyphen,
    and the result is capped at 50 characters. A name shorter than three
    characters gets ``"xyz"`` appended, and a non-alphanumeric first or last
    character is overwritten with ``"A"`` or ``"Z"`` respectively. The input
    path and the resulting name are printed.

    Args:
        filepath: Path of the source file.

    Returns:
        The sanitized collection name as a string.
    """
    # Keep only the stem; spaces become hyphens before transliteration.
    stem = Path(filepath).stem.replace(" ", "-")
    ascii_stem = unidecode(stem)

    # Collapse anything non-alphanumeric into single hyphens, then truncate.
    name = re.sub("[^A-Za-z0-9]+", "-", ascii_stem)[:50]

    # Pad short names so at least three characters are present.
    if len(name) < 3:
        name += "xyz"

    # Force both ends of the name to be alphanumeric.
    if not name[0].isalnum():
        name = f"A{name[1:]}"
    if not name[-1].isalnum():
        name = f"{name[:-1]}Z"

    print("\n\nFilepath: ", filepath)
    print("Collection name: ", name)
    return name
|
|
|
|
|
|
|
def create_db(splits, collection_name):
    """Embed document chunks and store them in a new Chroma collection.

    Args:
        splits: Documents handed to ``Chroma.from_documents``.
        collection_name: Name given to the created collection.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    )

    # Clear chromadb's shared system cache, then create a fresh ephemeral
    # client for this collection.
    chromadb.api.client.SharedSystemClient.clear_system_cache()
    ephemeral_client = chromadb.EphemeralClient()

    return Chroma.from_documents(
        documents=splits,
        embedding=embedder,
        client=ephemeral_client,
        collection_name=collection_name,
    )
|
|