import hashlib
import urllib.parse
import uuid
from pathlib import Path
from typing import List, Optional, Tuple

import pandas as pd
from loguru import logger

from app.config.models.configs import Document, Config
from app.parsers.markdown import markdown_splitter

HASH_BLOCKSIZE = 65536


class DocumentSplitter:
    def __init__(self, config: Config) -> None:
        self.document_path_settings = config.embeddings.document_settings
        self.chunk_sizes = config.embeddings.chunk_sizes

    def split(
        self,
        limit: Optional[int] = None,
    ) -> Tuple[List[Document], pd.DataFrame, pd.DataFrame]:
        """Split all configured markdown documents into chunks.

        Returns the chunked documents, a filename -> file hash mapping, and a
        document id -> file hash mapping.
        """
        all_docs = []
        hash_filename_mappings = []
        hash_docid_mappings = []

        for setting in self.document_path_settings:
            passage_prefix = setting.passage_prefix
            docs_path = Path(setting.doc_path)
            extension = "md"

            for chunk_size in self.chunk_sizes:
                paths = list(docs_path.glob(f"**/*.{extension}"))

                additional_parser_settings = setting.additional_parser_settings.get(
                    extension, dict()
                )

                (
                    docs,
                    hf_mappings,
                    hd_mappings,
                ) = self._get_documents_from_custom_splitter(
                    document_paths=paths,
                    splitter_func=markdown_splitter,
                    max_size=chunk_size,
                    passage_prefix=passage_prefix,
                    **additional_parser_settings,
                )

                all_docs.extend(docs)
                hash_filename_mappings.extend(hf_mappings)
                hash_docid_mappings.extend(hd_mappings)

        all_hash_filename_mappings = pd.DataFrame(hash_filename_mappings)
        all_hash_docid_mappings = pd.concat(hash_docid_mappings, axis=0)

        if limit:
            all_docs = all_docs[:limit]
            all_hash_filename_mappings = all_hash_filename_mappings[:limit]
            all_hash_docid_mappings = all_hash_docid_mappings[:limit]

        return all_docs, all_hash_filename_mappings, all_hash_docid_mappings

    def _get_documents_from_custom_splitter(
        self,
        document_paths: List[Path],
        splitter_func,
        max_size,
        passage_prefix: str,
        **additional_kwargs,
    ) -> Tuple[List[Document], List[dict], List[pd.DataFrame]]:
        """Split the given markdown files using `splitter_func`."""
        all_docs = []
        hash_filename_mappings = []
        hash_docid_mappings = []

        for path in document_paths:
            filepath = str(path)

            # Filename without its extension, used as a human-readable label.
            filename = path.stem

            if path.suffix != ".md":
                continue

            additional_kwargs.update({"filename": filepath})

            docs_data = splitter_func(path, max_size, **additional_kwargs)
            file_hash = get_md5_hash(path)

            # URL-encode the path so it can safely be stored as a source link.
            encoded_path = urllib.parse.quote(str(path))
            logger.info(encoded_path)

            docs = [
                Document(
                    page_content=passage_prefix + d["text"],
                    metadata={
                        **d["metadata"],
                        **{
                            "source": encoded_path,
                            "chunk_size": max_size,
                            "document_id": str(uuid.uuid1()),
                            "label": filename,
                        },
                    },
                )
                for d in docs_data
            ]

            # Replace missing page numbers with -1 so metadata values stay typed.
            for d in docs:
                if "page" in d.metadata and d.metadata["page"] is None:
                    d.metadata["page"] = -1

            all_docs.extend(docs)
            hash_filename_mappings.append(dict(filename=filepath, filehash=file_hash))

            df_hash_docid = (
                pd.DataFrame()
                .assign(docid=[d.metadata["document_id"] for d in docs])
                .assign(filehash=file_hash)
            )
            hash_docid_mappings.append(df_hash_docid)

        logger.info(f"Got {len(all_docs)} nodes.")
        return all_docs, hash_filename_mappings, hash_docid_mappings


def get_md5_hash(file_path: Path) -> str:
    """Compute the MD5 hex digest of a file, reading it in blocks."""
    hasher = hashlib.md5()
    with open(file_path, "rb") as file:
        buf = file.read(HASH_BLOCKSIZE)
        while buf:
            hasher.update(buf)
            buf = file.read(HASH_BLOCKSIZE)
    return hasher.hexdigest()
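

# --------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original module). It assumes a
# Config instance is already available with `embeddings.document_settings`
# and `embeddings.chunk_sizes` populated; how Config is constructed or loaded
# depends on app.config and is not shown here.
#
#   config = ...  # an app.config.models.configs.Config instance (assumption)
#   splitter = DocumentSplitter(config)
#   docs, file_map, docid_map = splitter.split(limit=100)
#   logger.info(f"Produced {len(docs)} chunks from {len(file_map)} files.")
# --------------------------------------------------------------------------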