lamhieu committed on
Commit
b2c7d24
·
1 Parent(s): 0f24792

chore: update estimate tokens

Browse files
Files changed (1) hide show
  1. lightweight_embeddings/service.py +7 -6
lightweight_embeddings/service.py CHANGED
@@ -369,12 +369,13 @@ class EmbeddingsService:
369
  }
370
 
371
  def estimate_tokens(self, input_data: Union[str, List[str]]) -> int:
372
- """
373
- Very rough heuristic: ~4 chars per token.
374
- """
375
- texts = self._validate_text_input(input_data)
376
- total_chars = sum(len(t) for t in texts)
377
- return max(1, round(total_chars / 4))
 
378
 
379
  @staticmethod
380
  def softmax(scores: np.ndarray) -> np.ndarray:
 
369
  }
370
 
371
  def estimate_tokens(self, input_data: Union[str, List[str]]) -> int:
372
+ """
373
+ Estimate token count using the model's tokenizer.
374
+ """
375
+ texts = self._validate_text_input(input_data)
376
+ model = self.text_models[self.config.text_model_type]
377
+ tokenized = model.tokenize(texts)
378
+ return sum(len(ids) for ids in tokenized['input_ids'])
379
 
380
  @staticmethod
381
  def softmax(scores: np.ndarray) -> np.ndarray: