Spaces:
Sleeping
Sleeping
import re | |
from transformers import AutoTokenizer | |
from app.config import EMBEDDING_MODEL | |
class TextPreprocessor:
    """
    A simple text preprocessor for cleaning and tokenizing text.

    Wraps a Hugging Face tokenizer (loaded from ``model_name``) and provides
    whitespace/control-character cleanup plus token counting.
    """

    # Compiled once at class level so clean_text() doesn't pay the pattern
    # lookup on every call. NOTE: \s already matches tabs and newlines, so the
    # original character class [\s\t\n] was redundant — \s+ is equivalent.
    _WHITESPACE_RE = re.compile(r"\s+")
    # ASCII control characters (C0 block plus DEL).
    _CONTROL_CHAR_RE = re.compile(r"[\x00-\x1F\x7F]")

    def __init__(self, model_name: str = EMBEDDING_MODEL):
        """
        Load the tokenizer used for token counting.

        Args:
            model_name: Hugging Face model identifier; defaults to the
                project-configured embedding model.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def clean_text(self, text: str) -> str:
        """
        Remove extra whitespace and control characters from text.

        Args:
            text: The text to clean.

        Returns:
            The cleaned text: runs of whitespace collapsed to a single space,
            ASCII control characters removed, leading/trailing space stripped.
        """
        text = self._WHITESPACE_RE.sub(" ", text)  # Normalize whitespace
        text = self._CONTROL_CHAR_RE.sub("", text)  # Remove control characters
        return text.strip()

    def count_tokens(self, text: str) -> int:
        """
        Count the number of tokens in the text using the loaded tokenizer.

        Args:
            text: The text to tokenize.

        Returns:
            The number of input IDs produced by the tokenizer — this includes
            any special tokens the tokenizer adds by default (e.g. [CLS]/[SEP]).
        """
        # Tokenize the text and return the length of the input IDs
        return len(self.tokenizer(text).input_ids)