chatbot-backend / src /utils /text_splitter.py
TalatMasood's picture
initial commit
640b1c8
raw
history blame
1.14 kB
# src/utils/text_splitter.py
from typing import List
def split_text(
    text: str,
    chunk_size: int = 500,
    overlap: int = 50
) -> List[str]:
    """
    Split a long text into fixed-size, overlapping character chunks.

    Chunks start every ``chunk_size - overlap`` characters, so each chunk
    repeats the last ``overlap`` characters of the previous one. The final
    chunk may be shorter than ``chunk_size``.

    Args:
        text (str): Input text to split
        chunk_size (int): Maximum size of each text chunk; must be positive
        overlap (int): Number of characters to overlap between chunks;
            must be smaller than ``chunk_size``. A negative value leaves
            gaps between chunks instead of overlapping them.

    Returns:
        List[str]: List of text chunks (empty when ``text`` is empty)

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``overlap`` is
            greater than or equal to ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        # A non-positive stride would never advance through the text and
        # the splitting loop would run forever.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    # Distance between the start positions of consecutive chunks.
    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
def clean_text(text: str) -> str:
    """
    Normalise the whitespace of a text.

    Every run of whitespace (spaces, tabs, newlines) is collapsed into a
    single space, and leading/trailing whitespace is dropped.

    Args:
        text (str): Input text to clean

    Returns:
        str: Text with its words separated by single spaces
    """
    # split() with no separator breaks on any whitespace run and discards
    # empty pieces, so re-joining with one space normalises the spacing.
    words = text.split()
    cleaned = ' '.join(words)
    # Further cleaning steps can be added here as needed, for example:
    # - Remove special characters
    # - Convert to lowercase
    # - Remove HTML tags
    return cleaned