HoangNB
Add embedding service and preprocessor; integrate with Gradio interface
133f1d4
import re
from transformers import AutoTokenizer
from app.config import EMBEDDING_MODEL
class TextPreprocessor:
    """
    A simple text preprocessor for cleaning and tokenizing text.

    The tokenizer is loaded once in ``__init__`` via
    ``AutoTokenizer.from_pretrained`` and reused by ``count_tokens``.
    """

    # ASCII control characters that are NOT whitespace. The whitespace-like
    # controls (\t, \n, \v, \f, \r and \x1c-\x1f, all matched by ``\s`` on
    # str patterns) are deliberately excluded so they are turned into a
    # separating space instead of being deleted, which would glue adjacent
    # words together.
    _NON_WHITESPACE_CONTROL_RE = re.compile(r"[\x00-\x08\x0E-\x1B\x7F]")

    # Any run of one or more whitespace characters.
    _WHITESPACE_RUN_RE = re.compile(r"\s+")

    def __init__(self, model_name: str = EMBEDDING_MODEL):
        """
        Load the tokenizer used for token counting.

        Args:
            model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
                Defaults to ``EMBEDDING_MODEL`` from ``app.config``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def clean_text(self, text: str) -> str:
        """
        Remove extra whitespace and control characters from text.

        Control characters are stripped *before* whitespace is collapsed.
        Doing it the other way round leaves a double space behind whenever a
        control character sat between two whitespace characters
        (e.g. ``"a \\x00 b"``).

        Args:
            text: The text to clean.

        Returns:
            The cleaned text: non-whitespace control characters removed, every
            whitespace run collapsed to a single space, and leading/trailing
            whitespace stripped.
        """
        # Drop control characters first so that the whitespace on either side
        # of them merges into a single run for the next step.
        text = self._NON_WHITESPACE_CONTROL_RE.sub("", text)
        # Normalize whitespace: each run becomes exactly one space.
        text = self._WHITESPACE_RUN_RE.sub(" ", text)
        return text.strip()

    def count_tokens(self, text: str) -> int:
        """
        Count the number of tokens in the text using the loaded tokenizer.

        Args:
            text: The text to tokenize.

        Returns:
            The number of input IDs the tokenizer produces for ``text``.
        """
        # NOTE(review): the tokenizer is called with its default settings, so
        # any special tokens it adds by default are presumably included in the
        # count - confirm this is what callers expect.
        return len(self.tokenizer(text).input_ids)