Spaces:
Running
Running
# src/utils/text_splitter.py
from typing import List | |
def split_text(
    text: str,
    chunk_size: int = 500,
    overlap: int = 50
) -> List[str]:
    """
    Split a long text into smaller, optionally overlapping chunks.

    Chunks are taken at fixed character offsets: each chunk starts
    ``chunk_size - overlap`` characters after the previous one. The final
    chunk may be shorter than ``chunk_size`` and can consist solely of
    characters that already appeared at the end of the previous chunk.

    Args:
        text (str): Input text to split
        chunk_size (int): Maximum size of each text chunk; must be positive
        overlap (int): Number of characters shared between consecutive
            chunks; must satisfy ``0 <= overlap < chunk_size``

    Returns:
        List[str]: List of text chunks (empty when ``text`` is empty)

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap < 0:
        raise ValueError(f"overlap must be non-negative, got {overlap}")
    if overlap >= chunk_size:
        # A step of zero or less would never advance through the text.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    # Distance between the start offsets of consecutive chunks.
    step = chunk_size - overlap
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), step)
    ]
def clean_text(text: str) -> str:
    """
    Normalise the whitespace of a piece of text.

    Leading and trailing whitespace is dropped and every internal run of
    whitespace (spaces, tabs, newlines, ...) becomes a single space.

    Args:
        text (str): Input text to clean

    Returns:
        str: Cleaned text
    """
    # split() with no separator discards empty fields, so surrounding and
    # repeated whitespace vanish before the words are stitched back together.
    words = text.split()
    normalised = ' '.join(words)

    # Extension point for further cleaning, e.g. stripping special
    # characters, lowercasing, or removing HTML tags.
    return normalised