Spaces:
Sleeping
Sleeping
import re | |
from transformers import AutoTokenizer | |
from app.config import EMBEDDING_MODEL | |
class TextPreprocessor:
    """
    A simple text preprocessor for cleaning and tokenizing text.

    Wraps a Hugging Face tokenizer (loaded from ``model_name``) and provides
    whitespace/control-character cleanup plus token counting.
    """

    # Compiled once at class level so clean_text() doesn't pay the pattern
    # lookup on every call. NOTE: \s already matches tabs and newlines, so the
    # original character class [\s\t\n] was redundant — \s+ is equivalent.
    _WHITESPACE_RE = re.compile(r"\s+")
    # ASCII control characters (C0 block plus DEL).
    _CONTROL_CHAR_RE = re.compile(r"[\x00-\x1F\x7F]")

    def __init__(self, model_name: str = EMBEDDING_MODEL):
        """
        Load the tokenizer used for token counting.

        Args:
            model_name: Hugging Face model identifier; defaults to the
                project-configured embedding model.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def clean_text(self, text: str) -> str:
        """
        Remove extra whitespace and control characters from text.

        Args:
            text: The text to clean.

        Returns:
            The cleaned text: runs of whitespace collapsed to a single space,
            ASCII control characters removed, leading/trailing space stripped.
        """
        text = self._WHITESPACE_RE.sub(" ", text)  # Normalize whitespace
        text = self._CONTROL_CHAR_RE.sub("", text)  # Remove control characters
        return text.strip()

    def count_tokens(self, text: str) -> int:
        """
        Count the number of tokens in the text using the loaded tokenizer.

        Args:
            text: The text to tokenize.

        Returns:
            The number of input IDs produced by the tokenizer — this includes
            any special tokens the tokenizer adds by default (e.g. [CLS]/[SEP]).
        """
        # Tokenize the text and return the length of the input IDs
        return len(self.tokenizer(text).input_ids)