Spaces:
Running
Running
File size: 1,144 Bytes
640b1c8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
# src/utils/text_splitter.py
from typing import List
def split_text(
    text: str,
    chunk_size: int = 500,
    overlap: int = 50
) -> List[str]:
    """
    Split a long text into fixed-size, optionally overlapping chunks.

    Chunks are taken at character offsets 0, step, 2 * step, ... where
    ``step = chunk_size - overlap``. The final chunk may be shorter than
    ``chunk_size``.

    Args:
        text (str): Input text to split.
        chunk_size (int): Maximum number of characters per chunk. Must be
            positive.
        overlap (int): Number of characters shared between consecutive
            chunks. Must be smaller than ``chunk_size``. A negative value
            is accepted and leaves a gap between chunks instead.

    Returns:
        List[str]: The chunks in order; an empty list for empty text.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``overlap`` is
            greater than or equal to ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        # A step of zero or less would never advance through the text,
        # so the splitting loop would not terminate.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    step = chunk_size - overlap
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), step)
    ]
def clean_text(text: str) -> str:
    """
    Clean and preprocess text by normalising its whitespace.

    Every run of whitespace (spaces, tabs, newlines) is collapsed into a
    single space, and leading and trailing whitespace is removed.

    Args:
        text (str): Input text to clean.

    Returns:
        str: The text with whitespace normalised.
    """
    # split() with no arguments splits on any whitespace run and drops
    # empty fields, so re-joining with one space normalises the spacing.
    words = text.split()

    # Further cleaning steps can be added here as needed, for example:
    # - Remove special characters
    # - Convert to lowercase
    # - Remove HTML tags
    return " ".join(words)