import os
import glob
from typing import Iterator, List
from dotenv import load_dotenv
from multiprocessing import Pool
from tqdm import tqdm

from langchain_cohere import CohereEmbeddings
from langchain.document_loaders import (
    CSVLoader,
    EverNoteLoader,
    PyMuPDFLoader,
    TextLoader,
    UnstructuredEmailLoader,
    UnstructuredEPubLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
    UnstructuredODTLoader,
    UnstructuredPowerPointLoader,
    UnstructuredWordDocumentLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
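
# NOTE: the langchain.* import paths above assume an older LangChain release;
# on newer releases the loaders, splitter and vectorstore have moved to the
# langchain_community and langchain_text_splitters packages.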

if not load_dotenv():
    print("Could not load .env file or it is empty. Please check if it exists and is readable.")
    exit(1)

# Imported only after load_dotenv() so that any settings in constants.py that
# read environment variables see the values from the .env file.
from constants import CHROMA_SETTINGS
import chromadb
from chromadb.api.segment import API

# Configuration, resolved from the environment / .env file.
# chunk_size and chunk_overlap are character counts for the text splitter.
persist_directory = os.environ.get('PERSIST_DIRECTORY')
source_directory = os.environ.get('SOURCE_DIRECTORY', 'source_documents')
embeddings_model_name = os.environ.get('EMBEDDINGS_MODEL_NAME')
chunk_size = 500
chunk_overlap = 50
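
# A minimal illustrative .env (values are examples, not requirements):
#   PERSIST_DIRECTORY=db
#   SOURCE_DIRECTORY=source_documents
#   COHERE_API_KEY=<your key>   # read implicitly by CohereEmbeddings
# Note that EMBEDDINGS_MODEL_NAME is read above but not used by the Cohere
# embeddings set up in main().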


class MyElmLoader(UnstructuredEmailLoader):
    """Wrapper that falls back to text/plain when the default loading fails"""

    def load(self) -> List[Document]:
        """Wrapper adding fallback for .eml files without an HTML part"""
        try:
            try:
                doc = UnstructuredEmailLoader.load(self)
            except ValueError as e:
                if 'text/html content not found in email' in str(e):
                    # Try plain text
                    self.unstructured_kwargs["content_source"] = "text/plain"
                    doc = UnstructuredEmailLoader.load(self)
                else:
                    raise
        except Exception as e:
            # Add file_path to exception message
            raise type(e)(f"{self.file_path}: {e}") from e

        return doc


# Map file extensions to document loaders and their arguments
LOADER_MAPPING = {
    ".csv": (CSVLoader, {}),
    ".doc": (UnstructuredWordDocumentLoader, {}),
    ".docx": (UnstructuredWordDocumentLoader, {}),
    ".enex": (EverNoteLoader, {}),
    ".eml": (MyElmLoader, {}),
    ".epub": (UnstructuredEPubLoader, {}),
    ".html": (UnstructuredHTMLLoader, {}),
    ".md": (UnstructuredMarkdownLoader, {}),
    ".odt": (UnstructuredODTLoader, {}),
    ".pdf": (PyMuPDFLoader, {}),
    ".ppt": (UnstructuredPowerPointLoader, {}),
    ".pptx": (UnstructuredPowerPointLoader, {}),
    ".txt": (TextLoader, {"encoding": "utf8"}),
}
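
# To support another format, add a mapping above; for example (illustrative,
# assuming the relevant unstructured extras are installed):
#   ".rtf": (UnstructuredRTFLoader, {}),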


def load_single_document(file_path: str) -> List[Document]:
    ext = "." + file_path.rsplit(".", 1)[-1].lower()
    if ext in LOADER_MAPPING:
        loader_class, loader_args = LOADER_MAPPING[ext]
        loader = loader_class(file_path, **loader_args)
        return loader.load()

    raise ValueError(f"Unsupported file extension '{ext}'")


def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
    """
    Loads all documents from the source documents directory, ignoring specified files
    """
    all_files = []
    for ext in LOADER_MAPPING:
        all_files.extend(
            glob.glob(os.path.join(source_dir, f"**/*{ext.lower()}"), recursive=True)
        )
        all_files.extend(
            glob.glob(os.path.join(source_dir, f"**/*{ext.upper()}"), recursive=True)
        )
    filtered_files = [file_path for file_path in all_files if file_path not in ignored_files]

    with Pool(processes=os.cpu_count()) as pool:
        results = []
        with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
            for docs in pool.imap_unordered(load_single_document, filtered_files):
                results.extend(docs)
                pbar.update()

    return results
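
# imap_unordered yields results as workers finish, so document order is not
# preserved; that is fine here because each Document carries its source path
# in metadata, which main() uses to skip already-ingested files.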


def process_documents(ignored_files: List[str] = []) -> List[Document]:
    """
    Load documents and split them into chunks
    """
    print(f"Loading documents from {source_directory}")
    documents = load_documents(source_directory, ignored_files)
    if not documents:
        print("No new documents to load")
        exit(0)
    print(f"Loaded {len(documents)} new documents from {source_directory}")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = text_splitter.split_documents(documents)
    print(f"Split into {len(documents)} chunks of text (max. {chunk_size} characters each)")
    return documents


def batch_chromadb_insertions(chroma_client: API, documents: List[Document]) -> Iterator[List[Document]]:
    """
    Split the documents to be inserted into batches no larger than the local
    Chroma client can process in a single call
    """
    max_batch_size = chroma_client.max_batch_size
    for i in range(0, len(documents), max_batch_size):
        yield documents[i:i + max_batch_size]
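
# This is a generator: main() pulls the first batch eagerly via next() so that
# Chroma.from_documents can create the collection, then streams the remaining
# batches through add_documents.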


def does_vectorstore_exist(persist_directory: str, embeddings: CohereEmbeddings) -> bool:
    """
    Checks if vectorstore exists
    """
    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
    return bool(db.get()['documents'])


def main():
    # Create embeddings (CohereEmbeddings picks up COHERE_API_KEY from the environment)
    embeddings = CohereEmbeddings()

    chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS, path=persist_directory)

    if does_vectorstore_exist(persist_directory, embeddings):
        # Update the existing vectorstore, skipping files already ingested
        print(f"Appending to existing vectorstore at {persist_directory}")
        db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client)
        collection = db.get()
        documents = process_documents([metadata['source'] for metadata in collection['metadatas']])
        print("Creating embeddings. May take some minutes...")
        for batched_chromadb_insertion in batch_chromadb_insertions(chroma_client, documents):
            db.add_documents(batched_chromadb_insertion)
    else:
        # Create a new vectorstore
        print("Creating new vectorstore")
        documents = process_documents()
        print("Creating embeddings. May take some minutes...")
        # Create the db with the first batch of documents to insert
        batched_chromadb_insertions = batch_chromadb_insertions(chroma_client, documents)
        first_insertion = next(batched_chromadb_insertions)
        db = Chroma.from_documents(first_insertion, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS, client=chroma_client)
        # Add the remaining batches
        for batched_chromadb_insertion in batched_chromadb_insertions:
            db.add_documents(batched_chromadb_insertion)

    print("Ingestion complete! You can now run privateGPT.py to query your documents")


if __name__ == "__main__":
    main()
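
# Typical usage, assuming this file is saved as ingest.py and a valid .env is
# present in the working directory:
#   python ingest.py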