# src/utils/text_splitter.py
from typing import List


def split_text(
    text: str,
    chunk_size: int = 500,
    overlap: int = 50
) -> List[str]:
    """
    Split a long text into smaller, optionally overlapping chunks.

    Chunks start every ``chunk_size - overlap`` characters, so each chunk
    repeats the last ``overlap`` characters of the previous one. The final
    chunk(s) may be shorter than ``chunk_size``.

    Args:
        text (str): Input text to split
        chunk_size (int): Maximum size of each text chunk; must be positive
        overlap (int): Number of characters to overlap between chunks;
            must satisfy ``0 <= overlap < chunk_size``

    Returns:
        List[str]: List of text chunks (empty if ``text`` is empty)

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap < 0:
        raise ValueError(f"overlap must be non-negative, got {overlap}")
    if overlap >= chunk_size:
        # A step of zero or less would never advance the start position,
        # so the split would never terminate.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than "
            f"chunk_size ({chunk_size})"
        )

    # Distance between the start positions of consecutive chunks.
    step = chunk_size - overlap
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), step)
    ]


def clean_text(text: str) -> str:
    """
    Clean and preprocess text.

    Collapses every run of whitespace (spaces, tabs, newlines) into a
    single space and strips leading and trailing whitespace.

    Args:
        text (str): Input text to clean

    Returns:
        str: Cleaned text
    """
    # str.split() with no arguments splits on any whitespace run and drops
    # empty strings, so re-joining with a single space normalizes spacing.
    text = ' '.join(text.split())

    # Add more cleaning steps as needed
    # For example:
    # - Remove special characters
    # - Convert to lowercase
    # - Remove HTML tags

    return text