import re

from transformers import AutoTokenizer

from app.config import EMBEDDING_MODEL

# Patterns are compiled once at import time instead of being re-looked-up on
# every clean_text() call.
# NOTE: \s already matches \t and \n, so the original [\s\t\n]+ class was
# redundant; \s+ is behaviorally identical.
_WHITESPACE_RE = re.compile(r"\s+")
# ASCII control characters (C0 block plus DEL) that survive whitespace folding.
_CONTROL_CHARS_RE = re.compile(r"[\x00-\x1F\x7F]")


class TextPreprocessor:
    """Clean raw text and count tokens using a Hugging Face tokenizer."""

    def __init__(self, model_name: str = EMBEDDING_MODEL) -> None:
        """
        Load the tokenizer for the given model.

        Args:
            model_name: Hugging Face model identifier; defaults to the
                project-wide EMBEDDING_MODEL setting.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def clean_text(self, text: str) -> str:
        """
        Remove extra whitespace and control characters from text.

        Args:
            text: The text to clean.

        Returns:
            The cleaned text: runs of whitespace collapsed to a single space,
            remaining control characters removed, outer whitespace stripped.
        """
        text = _WHITESPACE_RE.sub(" ", text)
        text = _CONTROL_CHARS_RE.sub("", text)
        return text.strip()

    def count_tokens(self, text: str) -> int:
        """
        Count the number of tokens in the text using the model's tokenizer.

        Args:
            text: The text to tokenize.

        Returns:
            The number of token IDs produced for ``text``.

        NOTE(review): the default ``tokenizer(text)`` call includes special
        tokens (e.g. [CLS]/[SEP]) in ``input_ids`` — confirm that counting
        them is intended for downstream length budgeting.
        """
        return len(self.tokenizer(text).input_ids)