import re
from transformers import AutoTokenizer
from app.config import EMBEDDING_MODEL


class TextPreprocessor:
    """
    A simple text preprocessor for cleaning and tokenizing text.
    """

    def __init__(self, model_name: str = EMBEDDING_MODEL):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def clean_text(self, text: str) -> str:
        """
        Remove extra whitespace and control characters from text.

        Args:
            text: The text to clean.

        Returns:
            The cleaned text.
        """
        text = re.sub(r"[\s\t\n]+", " ", text)  # Normalize whitespace runs to a single space
        text = re.sub(r"[\x00-\x1F\x7F]", "", text)  # Remove control characters
        return text.strip()

    def count_tokens(self, text: str) -> int:
        """
        Count the number of tokens in the text using a tokenizer.

        Args:
            text: The text to tokenize.

        Returns:
            The number of tokens.
        """
        # Tokenize the text and return the length of the input IDs
        return len(self.tokenizer(text).input_ids)