"""
Indexing with vector database
"""
from pathlib import Path
import re
import chromadb
from unidecode import unidecode
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
# Load PDF or TXT documents and create doc splits
def load_doc(list_file_path, chunk_size, chunk_overlap):
"""Load documents and create doc splits"""
pages = []
full_text = ""
for path in list_file_path:
if path.endswith(".pdf"):
loader = PyPDFLoader(path)
elif path.endswith(".txt"):
loader = TextLoader(path)
else:
continue
doc_pages = loader.load()
pages.extend(doc_pages)
full_text += "\n".join([p.page_content for p in doc_pages]) + "\n"
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)
doc_splits = text_splitter.split_documents(pages)
return doc_splits, full_text
# Generate collection name for vector database
def create_collection_name(filepath):
"""Create collection name for vector database"""
collection_name = Path(filepath).stem
collection_name = collection_name.replace(" ", "-")
collection_name = unidecode(collection_name)
collection_name = re.sub("[^A-Za-z0-9]+", "-", collection_name)
collection_name = collection_name[:50]
if len(collection_name) < 3:
collection_name = collection_name + "xyz"
if not collection_name[0].isalnum():
collection_name = "A" + collection_name[1:]
if not collection_name[-1].isalnum():
collection_name = collection_name[:-1] + "Z"
print("\n\nFilepath: ", filepath)
print("Collection name: ", collection_name)
return collection_name
# Create vector database
def create_db(splits, collection_name):
"""Create embeddings and vector database"""
embedding = HuggingFaceEmbeddings(
model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
)
chromadb.api.client.SharedSystemClient.clear_system_cache()
new_client = chromadb.EphemeralClient()
vectordb = Chroma.from_documents(
documents=splits,
embedding=embedding,
client=new_client,
collection_name=collection_name,
)
return vectordb
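
# Example usage (a minimal sketch): chain the three helpers above to build an
# in-memory vector database from local files and run a test query. The file
# paths, chunk size, chunk overlap, and query string below are illustrative
# assumptions, not values taken from this module.
if __name__ == "__main__":
    sample_files = ["docs/report.pdf", "docs/notes.txt"]  # hypothetical paths
    splits, full_text = load_doc(sample_files, chunk_size=1000, chunk_overlap=100)
    name = create_collection_name(sample_files[0])
    vectordb = create_db(splits, name)
    # Confirm the index works with a simple similarity search.
    for doc in vectordb.similarity_search("What is this document about?", k=3):
        print(doc.page_content[:200])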