import re

from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter

from llm_engineering.application.networks import EmbeddingModelSingleton

embedding_model = EmbeddingModelSingleton()


def chunk_text(text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> list[str]:
    # First pass: split on paragraph boundaries into sections of roughly
    # chunk_size characters.
    character_splitter = RecursiveCharacterTextSplitter(separators=["\n\n"], chunk_size=chunk_size, chunk_overlap=0)
    text_split_by_characters = character_splitter.split_text(text)

    # Second pass: re-split each section by token count so every chunk fits
    # the embedding model's input window, with overlapping tokens for context.
    token_splitter = SentenceTransformersTokenTextSplitter(
        chunk_overlap=chunk_overlap,
        tokens_per_chunk=embedding_model.max_input_length,
        model_name=embedding_model.model_id,
    )
    chunks_by_tokens = []
    for section in text_split_by_characters:
        chunks_by_tokens.extend(token_splitter.split_text(section))

    return chunks_by_tokens


def chunk_document(text: str, min_length: int, max_length: int) -> list[str]:
    """Alias for chunk_article()."""

    return chunk_article(text, min_length, max_length)


def chunk_article(text: str, min_length: int, max_length: int) -> list[str]:
    # Split on sentence boundaries (whitespace preceded by "." or "?") while
    # avoiding false splits inside abbreviations such as "e.g." and after
    # titles such as "Mr.".
    sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", text)

    extracts = []
    current_chunk = ""
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        # Grow the current extract until adding one more sentence would exceed
        # max_length, then flush it if it meets the minimum length.
        if len(current_chunk) + len(sentence) <= max_length:
            current_chunk += sentence + " "
        else:
            if len(current_chunk) >= min_length:
                extracts.append(current_chunk.strip())
            current_chunk = sentence + " "

    # Flush the trailing extract.
    if len(current_chunk) >= min_length:
        extracts.append(current_chunk.strip())

    return extracts
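

# A minimal usage sketch of chunk_article(); the sample text below is
# hypothetical. chunk_text() is left out here only because calling it loads
# the embedding model behind EmbeddingModelSingleton.
if __name__ == "__main__":
    sample = (
        "Retrieval pipelines depend on good chunking. Sentences are grouped "
        "into extracts no longer than max_length characters. Extracts shorter "
        "than min_length are dropped. Does the splitter handle questions? Yes."
    )
    # With these bounds, the first two sentences form one extract and the
    # remaining three form another.
    for extract in chunk_article(sample, min_length=40, max_length=120):
        print(repr(extract))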