presidio committed on
Commit
3afd122
·
1 Parent(s): 547518c

Upload 3 files

Browse files
transformers_rec/transformers_recognizer.py CHANGED
@@ -224,14 +224,18 @@ class TransformersRecognizer(EntityRecognizer):
224
  model_max_length = self.pipeline.tokenizer.model_max_length
225
  # calculate inputs based on the text
226
  text_length = len(text)
227
- # split text into chunks
228
- logger.info(
229
- f"splitting the text into chunks, length {text_length} > {model_max_length*2}"
230
- )
231
  predictions = list()
232
- chunk_indexes = TransformersRecognizer.split_text_to_word_chunks(
233
- text_length, self.chunk_length, self.text_overlap_length
234
- )
 
 
 
 
 
 
 
 
235
 
236
  # iterate over text chunks and run inference
237
  for chunk_start, chunk_end in chunk_indexes:
 
224
  model_max_length = self.pipeline.tokenizer.model_max_length
225
  # calculate inputs based on the text
226
  text_length = len(text)
 
 
 
 
227
  predictions = list()
228
+ if text_length > model_max_length*2:
229
+ # split text into chunks
230
+ logger.info(
231
+ f"splitting the text into chunks, length {text_length} > {model_max_length*2}"
232
+ )
233
+
234
+ chunk_indexes = TransformersRecognizer.split_text_to_word_chunks(
235
+ text_length, self.chunk_length, self.text_overlap_length
236
+ )
237
+ else:
238
+ chunk_indexes = [[0, text_length]]
239
 
240
  # iterate over text chunks and run inference
241
  for chunk_start, chunk_end in chunk_indexes: