DawnC committed on
Commit
0f19e77
·
verified ·
1 Parent(s): 59e2bcf

Update response_processor.py

Browse files
Files changed (1) hide show
  1. response_processor.py +117 -239
response_processor.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import re
2
  import logging
3
  import traceback
@@ -195,8 +196,8 @@ class ResponseProcessor:
195
  raise ResponseProcessingError("Empty response provided for cleaning")
196
 
197
  try:
198
- # 調試:記錄清理前的原始回應
199
- self.logger.info(f"DEBUG: Response before cleaning: {response}")
200
 
201
  self.logger.debug(f"Starting response cleaning (original length: {len(response)})")
202
 
@@ -209,8 +210,8 @@ class ResponseProcessor:
209
  else:
210
  cleaned_response = self._clean_general_response(response)
211
 
212
- # 調試:記錄清理後的回應
213
- self.logger.info(f"DEBUG: Response after cleaning: {cleaned_response}")
214
 
215
  # 如果清理後內容過短,嘗試從原始回應中恢復
216
  if len(cleaned_response.strip()) < 40:
@@ -315,80 +316,20 @@ class ResponseProcessor:
315
  def _critical_format_preprocess(self, response: str) -> str:
316
  """
317
  關鍵格式預處理,處理最常見的格式問題
318
-
319
  Args:
320
  response: 原始回應
321
-
322
  Returns:
323
  str: 預處理後的回應
324
  """
325
  if not response:
326
  return response
327
-
328
  try:
329
  import re
330
- # 移除各種形式的 confirmed
331
- confirmed_patterns = [
332
- r'\bconfirmed\s+', # "confirmed cars" -> "cars"
333
- r'\b(\d+)\s+confirmed\s+([a-zA-Z\s]+)', # "12 confirmed cars" -> "12 cars"
334
- r'\b(one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve)\s+confirmed\s+([a-zA-Z\s]+)', # "twelve confirmed cars" -> "twelve cars"
335
- ]
336
-
337
- for pattern in confirmed_patterns:
338
- if pattern == r'\bconfirmed\s+':
339
- response = re.sub(pattern, '', response, flags=re.IGNORECASE)
340
- else:
341
- response = re.sub(pattern, r'\1 \2', response, flags=re.IGNORECASE)
342
-
343
- # 數字轉文字的完整字典
344
- number_conversions = {
345
- '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four', '5': 'five',
346
- '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine', '10': 'ten',
347
- '11': 'eleven', '12': 'twelve', '13': 'thirteen', '14': 'fourteen', '15': 'fifteen',
348
- '16': 'sixteen', '17': 'seventeen', '18': 'eighteen', '19': 'nineteen', '20': 'twenty'
349
- }
350
-
351
- # 強化數字替換邏輯 - 處理各種語法結構
352
- for digit, word in number_conversions.items():
353
- # 模式1: 數字 + 名詞 (如 "3 cars", "12 people")
354
- pattern1 = rf'\b{digit}\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\b'
355
- response = re.sub(pattern1, rf'{word} \1', response)
356
-
357
- # 模式2: 數字 + visible/present + 名詞 (如 "3 visible traffic lights")
358
- pattern2 = rf'\b{digit}\s+(visible|present|apparent|evident)\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\b'
359
- response = re.sub(pattern2, rf'{word} \1 \2', response, flags=re.IGNORECASE)
360
-
361
- # 模式3: 介詞 + 數字 + 名詞 (如 "against a backdrop of 3 visible traffic lights")
362
- pattern3 = rf'\b(against|with|featuring|including|containing)\s+(?:a\s+backdrop\s+of\s+)?{digit}\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\b'
363
- response = re.sub(pattern3, rf'\1 {word} \2', response, flags=re.IGNORECASE)
364
-
365
- # 模式4: 複合描述中的數字 (如 "featuring twelve confirmed cars and 3 confirmed persons")
366
- pattern4 = rf'\b(and|,)\s+{digit}\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\b'
367
- response = re.sub(pattern4, rf'\1 {word} \2', response, flags=re.IGNORECASE)
368
-
369
- grammar_fixes = [
370
- # persons -> people 的全面修正
371
- (r'\b(\d+|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty)\s+persons\b', r'\1 people'),
372
- (r'\bmultiple\s+persons\b', 'multiple people'),
373
- (r'\bseveral\s+persons\b', 'several people'),
374
- (r'\bmany\s+persons\b', 'many people'),
375
- (r'\ba\s+few\s+persons\b', 'a few people'),
376
- (r'\bsome\s+persons\b', 'some people'),
377
- (r'\bvarious\s+persons\b', 'various people'),
378
- (r'\bnumerous\s+persons\b', 'numerous people'),
379
-
380
- # 修正語法結構問題
381
- (r'\bvisible\s+traffic\s+lights\b', 'traffic lights visible'),
382
- (r'\bpresent\s+traffic\s+lights\b', 'traffic lights present'),
383
- (r'\bapparent\s+traffic\s+lights\b', 'traffic lights apparent'),
384
-
385
- # 修正重複的形容詞結構
386
- (r'\b(visible|present|apparent|evident)\s+(visible|present|apparent|evident)\s+', r'\1 '),
387
- ]
388
-
389
- for pattern, replacement in grammar_fixes:
390
- response = re.sub(pattern, replacement, response, flags=re.IGNORECASE)
391
 
 
392
  # 首先處理已知的斜線組合,使用形容詞替換
393
  for slash_combo, replacement in self.slash_replacements.items():
394
  if slash_combo.lower() in response.lower():
@@ -399,11 +340,13 @@ class ResponseProcessor:
399
  replacement_formatted = replacement.title()
400
  else:
401
  replacement_formatted = replacement
402
-
403
  # 執行替換(不區分大小寫)
404
  response = re.sub(re.escape(slash_combo), replacement_formatted, response, flags=re.IGNORECASE)
405
-
 
406
  # 處理其他未預定義的斜線模式
 
407
  slash_pattern = r'\b([a-zA-Z]+)/([a-zA-Z]+)\b'
408
  matches = list(re.finditer(slash_pattern, response))
409
  for match in reversed(matches): # 從後往前處理避免位置偏移
@@ -414,19 +357,22 @@ class ResponseProcessor:
414
  else:
415
  replacement = word2
416
  response = response[:match.start()] + replacement + response[match.end():]
417
-
 
 
418
  # 首先處理已知的底線組合
419
  for underscore_combo, replacement in self.underscore_replacements.items():
420
  if underscore_combo in response:
421
  response = response.replace(underscore_combo, replacement)
422
-
 
423
  # 處理三個詞的底線組合:word_word_word → word word word
424
  response = re.sub(r'\b([a-z]+)_([a-z]+)_([a-z]+)\b', r'\1 \2 \3', response)
425
-
426
  # 處理任何剩餘的底線模式:word_word → word word
427
  response = re.sub(r'\b([a-zA-Z]+)_([a-zA-Z]+)\b', r'\1 \2', response)
428
-
429
- # 確保句子的完整性
430
  incomplete_sentence_fixes = [
431
  (r'\bIn\s*,\s*', 'Throughout the area, '),
432
  (r'\bOverall,\s+exudes\b', 'Overall, the scene exudes'),
@@ -434,15 +380,35 @@ class ResponseProcessor:
434
  (r'\bwith its lights turned illuminating\b', 'with its lights illuminating'),
435
  (r'\bwhere it stands as\b', 'where it stands as'),
436
  ]
437
-
438
  for pattern, replacement in incomplete_sentence_fixes:
439
  response = re.sub(pattern, replacement, response, flags=re.IGNORECASE)
440
-
441
- # 清理多餘空格並確保格式一致性
442
- response = re.sub(r'\s+', ' ', response).strip()
443
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
  return response
445
-
446
  except Exception as e:
447
  self.logger.warning(f"Error in critical format preprocessing: {str(e)}")
448
  return response
@@ -669,88 +635,50 @@ class ResponseProcessor:
669
  return response # 發生錯誤時返回原始回應
670
 
671
  def _handle_repetitive_vocabulary(self, response: str) -> str:
672
- """處理重複詞彙,使用改進的檢測和替換機制"""
673
  try:
674
- # 先進行重複模式檢測(記錄但不直接處理)
675
  if hasattr(self, 'repetitive_patterns'):
676
  for pattern, issue in self.repetitive_patterns:
677
- matches = list(re.finditer(pattern, response, re.IGNORECASE | re.DOTALL))
678
- if matches:
679
  self.logger.warning(f"Text quality issue detected: {issue} in response: \"{response[:100]}...\"")
680
-
681
  if not hasattr(self, 'replacement_alternatives') or not self.replacement_alternatives:
682
  return response
683
-
684
  processed_response = response
685
-
686
- # 強化的重複詞彙處理
687
  for word_to_replace, alternatives in self.replacement_alternatives.items():
688
- if not alternatives:
689
  continue
690
-
691
- # 創建更精確的詞彙匹配模式
692
- word_pattern = re.compile(r'\b' + re.escape(word_to_replace) + r'\b', re.IGNORECASE)
693
- matches = list(word_pattern.finditer(processed_response))
694
-
695
- if len(matches) <= 1:
696
- continue # 如果只出現一次或沒有出現,跳過
697
-
698
- # 對於多次出現的情況,進行智能替換
699
- replacement_count = 0
700
- alternative_index = 0
701
-
702
- def smart_replacer(match_obj):
703
- nonlocal replacement_count, alternative_index
704
- replacement_count += 1
705
- original_word = match_obj.group(0)
706
-
707
- # 第一次出現保持原樣,後續出現進行替換
708
- if replacement_count == 1:
709
- return original_word
710
-
711
- # 選擇適當的替代詞
712
- replacement = alternatives[alternative_index % len(alternatives)]
713
- alternative_index += 1
714
-
715
- # 保持原始大小寫格式
716
- if original_word.isupper():
717
- return replacement.upper()
718
- elif original_word.istitle():
719
- return replacement.capitalize()
720
- return replacement
721
-
722
- processed_response = word_pattern.sub(smart_replacer, processed_response)
723
-
724
- # === 新增:專門處理 "positioned" 的特殊邏輯 ===
725
- # 由於 "positioned" 經常出現問題,給予特別處理
726
- positioned_pattern = r'\b(positioned)\b'
727
- positioned_matches = re.findall(positioned_pattern, processed_response, re.IGNORECASE)
728
-
729
- if len(positioned_matches) > 1:
730
- # 替換除了第一個以外的所有 "positioned"
731
- positioned_alternatives = ['arranged', 'placed', 'set', 'located', 'situated']
732
- replacement_counter = 0
733
-
734
- def positioned_replacer(match):
735
- nonlocal replacement_counter
736
- if replacement_counter == 0:
737
- replacement_counter += 1
738
- return match.group(0) # 保持第一個不變
739
- else:
740
- alt_index = (replacement_counter - 1) % len(positioned_alternatives)
741
- replacement_counter += 1
742
- original = match.group(0)
743
- new_word = positioned_alternatives[alt_index]
744
-
745
- # 保持大小寫格式
746
- if original.isupper():
747
- return new_word.upper()
748
- elif original.istitle():
749
- return new_word.capitalize()
750
- return new_word
751
-
752
- processed_response = re.sub(positioned_pattern, positioned_replacer, processed_response, flags=re.IGNORECASE)
753
-
754
  # 移除 identical 等重複性描述詞彙
755
  identical_cleanup_patterns = [
756
  (r'\b(\d+)\s+identical\s+([a-zA-Z\s]+)', r'\1 \2'),
@@ -760,35 +688,41 @@ class ResponseProcessor:
760
  (r'\bcomprehensive view featuring\b', 'scene featuring'),
761
  (r'\bcomprehensive display of\b', 'display of'),
762
  ]
763
-
764
  for pattern, replacement in identical_cleanup_patterns:
765
  processed_response = re.sub(pattern, replacement, processed_response, flags=re.IGNORECASE)
766
-
767
- # 數字到文字轉換(保持原有邏輯)
768
  number_conversions = {
769
  '2': 'two', '3': 'three', '4': 'four', '5': 'five', '6': 'six',
770
  '7': 'seven', '8': 'eight', '9': 'nine', '10': 'ten',
771
  '11': 'eleven', '12': 'twelve'
772
  }
773
-
 
774
  for digit, word in number_conversions.items():
775
- # 各種數字模式的處理
776
- patterns_to_fix = [
777
- (rf'\b{digit}\s+([a-zA-Z]+s)\b', rf'{word} \1'),
778
- (rf'\b{digit}\s+(more|additional|other|identical)\s+([a-zA-Z]+s)\b', rf'{word} \1 \2'),
779
- (rf'\b{digit}\s+([a-zA-Z]+)\s+([a-zA-Z]+s)\b', rf'{word} \1 \2'),
780
- (rf'\b(around|approximately|about)\s+{digit}\s+([a-zA-Z]+s)\b', rf'\1 {word} \2'),
781
- ]
782
-
783
- for pattern, replacement in patterns_to_fix:
784
- processed_response = re.sub(pattern, replacement, processed_response, flags=re.IGNORECASE)
785
-
 
 
 
 
 
786
  return processed_response
787
-
788
  except Exception as e:
789
  self.logger.error(f"Error in _handle_repetitive_vocabulary: {str(e)}")
790
  self.logger.error(traceback.format_exc())
791
- return response
792
 
793
  def _ensure_grammatical_completeness(self, response: str) -> str:
794
  """
@@ -1055,75 +989,19 @@ class ResponseProcessor:
1055
  return response.strip()
1056
 
1057
  def _control_word_length(self, response: str) -> str:
1058
- """控制文字長度在合理範圍內,確保句子完整性"""
1059
  words = response.split()
1060
-
1061
- # 提高基礎限制,給予更多彈性
1062
- base_limit = 220
1063
- extended_limit = 250
1064
-
1065
- if len(words) <= base_limit:
1066
- return response
1067
-
1068
- # 首先嘗試在基礎限制內找到完整句子
1069
- truncated = ' '.join(words[:base_limit])
1070
- last_period = max(truncated.rfind('.'), truncated.rfind('!'), truncated.rfind('?'))
1071
-
1072
- # 如果在基礎限制內找到了適當的句子結尾
1073
- if last_period > len(truncated) * 0.8: # 確保截斷點不會太早
1074
- result = truncated[:last_period+1]
1075
- self.logger.info(f"Text truncated at {base_limit} words with proper sentence ending")
1076
- return result
1077
-
1078
- # 如果基礎限制內沒有找到合適結尾,擴展搜尋範圍
1079
- if len(words) > extended_limit:
1080
- extended_truncated = ' '.join(words[:extended_limit])
1081
- extended_last_period = max(
1082
- extended_truncated.rfind('.'),
1083
- extended_truncated.rfind('!'),
1084
- extended_truncated.rfind('?')
1085
- )
1086
-
1087
- # 在擴展範圍內找到合適的結尾
1088
- if extended_last_period > len(extended_truncated) * 0.7:
1089
- result = extended_truncated[:extended_last_period+1]
1090
- self.logger.info(f"Text truncated at extended limit with proper sentence ending")
1091
- return result
1092
-
1093
- # 如果仍然找不到合適的結尾,使用智能截斷
1094
- # 尋找最後一個完整的句子或子句
1095
- final_truncated = ' '.join(words[:base_limit])
1096
-
1097
- # 尋找可能的子句結尾(逗號後的位置)
1098
- last_comma = final_truncated.rfind(',')
1099
- last_semicolon = final_truncated.rfind(';')
1100
-
1101
- # 選擇最佳截斷點
1102
- best_cutoff = max(last_period, last_comma, last_semicolon)
1103
-
1104
- if best_cutoff > len(final_truncated) * 0.6:
1105
- # 如果是逗號或分號結尾,改為句號
1106
- result = final_truncated[:best_cutoff]
1107
- if result.endswith(',') or result.endswith(';'):
1108
- result = result[:-1] + '.'
1109
- elif not result.endswith(('.', '!', '?')):
1110
- result += '.'
1111
-
1112
- self.logger.warning(f"Text truncated with intelligent cutoff at position {best_cutoff}")
1113
- return result
1114
-
1115
- # 移除可能不完整的最後一個句子
1116
- # 找到倒數第二個句子的結尾
1117
- second_last_period = final_truncated.rfind('.', 0, last_period)
1118
- if second_last_period > 0:
1119
- result = final_truncated[:second_last_period+1]
1120
- self.logger.warning("Text truncated by removing incomplete final sentence")
1121
- return result
1122
-
1123
- # 如果所有方法都失敗,添加適合的結尾
1124
- result = final_truncated.rstrip() + "."
1125
- self.logger.warning("Text truncated with forced period ending")
1126
- return result
1127
 
1128
  def _final_formatting(self, response: str) -> str:
1129
  """最終格式化處理"""
 
1
+ # %%writefile response_processor.py
2
  import re
3
  import logging
4
  import traceback
 
196
  raise ResponseProcessingError("Empty response provided for cleaning")
197
 
198
  try:
199
+ # 記錄清理前的最初回應
200
+ # self.logger.info(f"DEBUG: Response before cleaning: {response}")
201
 
202
  self.logger.debug(f"Starting response cleaning (original length: {len(response)})")
203
 
 
210
  else:
211
  cleaned_response = self._clean_general_response(response)
212
 
213
+ # 記錄清理後的回應
214
+ # self.logger.info(f"DEBUG: Response after cleaning: {cleaned_response}")
215
 
216
  # 如果清理後內容過短,嘗試從原始回應中恢復
217
  if len(cleaned_response.strip()) < 40:
 
316
  def _critical_format_preprocess(self, response: str) -> str:
317
  """
318
  關鍵格式預處理,處理最常見的格式問題
319
+
320
  Args:
321
  response: 原始回應
322
+
323
  Returns:
324
  str: 預處理後的回應
325
  """
326
  if not response:
327
  return response
328
+
329
  try:
330
  import re
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
 
332
+ # 第一優先級:處理斜線問題
333
  # 首先處理已知的斜線組合,使用形容詞替換
334
  for slash_combo, replacement in self.slash_replacements.items():
335
  if slash_combo.lower() in response.lower():
 
340
  replacement_formatted = replacement.title()
341
  else:
342
  replacement_formatted = replacement
343
+
344
  # 執行替換(不區分大小寫)
345
  response = re.sub(re.escape(slash_combo), replacement_formatted, response, flags=re.IGNORECASE)
346
+ self.logger.debug(f"Replaced slash pattern '{slash_combo}' with '{replacement_formatted}'")
347
+
348
  # 處理其他未預定義的斜線模式
349
+ # 標準斜線模式:word/word
350
  slash_pattern = r'\b([a-zA-Z]+)/([a-zA-Z]+)\b'
351
  matches = list(re.finditer(slash_pattern, response))
352
  for match in reversed(matches): # 從後往前處理避免位置偏移
 
357
  else:
358
  replacement = word2
359
  response = response[:match.start()] + replacement + response[match.end():]
360
+ self.logger.debug(f"Replaced general slash pattern '{match.group(0)}' with '{replacement}'")
361
+
362
+ # 第二優先級:處理底線格式
363
  # 首先處理已知的底線組合
364
  for underscore_combo, replacement in self.underscore_replacements.items():
365
  if underscore_combo in response:
366
  response = response.replace(underscore_combo, replacement)
367
+ self.logger.debug(f"Replaced underscore pattern '{underscore_combo}' with '{replacement}'")
368
+
369
  # 處理三個詞的底線組合:word_word_word → word word word
370
  response = re.sub(r'\b([a-z]+)_([a-z]+)_([a-z]+)\b', r'\1 \2 \3', response)
371
+
372
  # 處理任何剩餘的底線模式:word_word → word word
373
  response = re.sub(r'\b([a-zA-Z]+)_([a-zA-Z]+)\b', r'\1 \2', response)
374
+
375
+ # 第三優先級:修正不完整句子
376
  incomplete_sentence_fixes = [
377
  (r'\bIn\s*,\s*', 'Throughout the area, '),
378
  (r'\bOverall,\s+exudes\b', 'Overall, the scene exudes'),
 
380
  (r'\bwith its lights turned illuminating\b', 'with its lights illuminating'),
381
  (r'\bwhere it stands as\b', 'where it stands as'),
382
  ]
383
+
384
  for pattern, replacement in incomplete_sentence_fixes:
385
  response = re.sub(pattern, replacement, response, flags=re.IGNORECASE)
386
+
387
+ # 第四優先級:語法修正處理(像是person and people)
388
+ grammar_fixes = [
389
+ (r'\b(\d+)\s+persons\b', r'\1 people'),
390
+ (r'\bone\s+persons\b', 'one person'),
391
+ (r'\btwo\s+persons\b', 'two people'),
392
+ (r'\bthree\s+persons\b', 'three people'),
393
+ (r'\bfour\s+persons\b', 'four people'),
394
+ (r'\bfive\s+persons\b', 'five people'),
395
+ (r'\bsix\s+persons\b', 'six people'),
396
+ (r'\bseven\s+persons\b', 'seven people'),
397
+ (r'\beight\s+persons\b', 'eight people'),
398
+ (r'\bnine\s+persons\b', 'nine people'),
399
+ (r'\bten\s+persons\b', 'ten people'),
400
+ (r'\bmultiple\s+persons\b', 'multiple people'),
401
+ (r'\bseveral\s+persons\b', 'several people'),
402
+ (r'\bmany\s+persons\b', 'many people'),
403
+ (r'\ba\s+few\s+persons\b', 'a few people'),
404
+ (r'\bsome\s+persons\b', 'some people')
405
+ ]
406
+
407
+ for pattern, replacement in grammar_fixes:
408
+ response = re.sub(pattern, replacement, response, flags=re.IGNORECASE)
409
+
410
  return response
411
+
412
  except Exception as e:
413
  self.logger.warning(f"Error in critical format preprocessing: {str(e)}")
414
  return response
 
635
  return response # 發生錯誤時返回原始回應
636
 
637
  def _handle_repetitive_vocabulary(self, response: str) -> str:
638
+ """處理重複詞彙,使用 re.sub 和可呼叫的替換函數以提高效率和準確性。"""
639
  try:
640
+ # 檢測重複模式 (僅警告)
641
  if hasattr(self, 'repetitive_patterns'):
642
  for pattern, issue in self.repetitive_patterns:
643
+ if re.search(pattern, response, re.IGNORECASE | re.DOTALL):
 
644
  self.logger.warning(f"Text quality issue detected: {issue} in response: \"{response[:100]}...\"")
645
+
646
  if not hasattr(self, 'replacement_alternatives') or not self.replacement_alternatives:
647
  return response
648
+
649
  processed_response = response
650
+
 
651
  for word_to_replace, alternatives in self.replacement_alternatives.items():
652
+ if not alternatives: # 如果沒有可用的替代詞,則跳過
653
  continue
654
+
655
+ # 為每個詞創建一個獨立的計數器和替代索引
656
+ # 使用閉包或一個小類來封裝狀態
657
+ class WordReplacer:
658
+ def __init__(self, alternatives_list):
659
+ self.count = 0
660
+ self.alternative_idx = 0
661
+ self.alternatives_list = alternatives_list
662
+
663
+ def __call__(self, match_obj):
664
+ self.count += 1
665
+ original_word = match_obj.group(0)
666
+ if self.count > 1: # 從第二次出現開始替換
667
+ replacement = self.alternatives_list[self.alternative_idx % len(self.alternatives_list)]
668
+ self.alternative_idx += 1
669
+ # 保持原始大小寫格式
670
+ if original_word.isupper():
671
+ return replacement.upper()
672
+ elif original_word.istitle():
673
+ return replacement.capitalize()
674
+ return replacement
675
+ return original_word # 因為第一次出現, 就不用替換
676
+
677
+ replacer_instance = WordReplacer(alternatives)
678
+ # 使用 \b 確保匹配的是整個單詞
679
+ pattern = re.compile(r'\b' + re.escape(word_to_replace) + r'\b', re.IGNORECASE)
680
+ processed_response = pattern.sub(replacer_instance, processed_response)
681
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
682
  # 移除 identical 等重複性描述詞彙
683
  identical_cleanup_patterns = [
684
  (r'\b(\d+)\s+identical\s+([a-zA-Z\s]+)', r'\1 \2'),
 
688
  (r'\bcomprehensive view featuring\b', 'scene featuring'),
689
  (r'\bcomprehensive display of\b', 'display of'),
690
  ]
691
+
692
  for pattern, replacement in identical_cleanup_patterns:
693
  processed_response = re.sub(pattern, replacement, processed_response, flags=re.IGNORECASE)
694
+
695
+ # 數字到文字
696
  number_conversions = {
697
  '2': 'two', '3': 'three', '4': 'four', '5': 'five', '6': 'six',
698
  '7': 'seven', '8': 'eight', '9': 'nine', '10': 'ten',
699
  '11': 'eleven', '12': 'twelve'
700
  }
701
+
702
+ # 處理各種語法結構中的數字
703
  for digit, word in number_conversions.items():
704
+ # 模式1: 數字 + 單一複數詞 (如 "7 chairs")
705
+ pattern1 = rf'\b{digit}\s+([a-zA-Z]+s)\b'
706
+ processed_response = re.sub(pattern1, rf'{word} \1', processed_response)
707
+
708
+ # 模式2: 數字 + 修飾詞 + 複數詞 ( "7 more chairs")
709
+ pattern2 = rf'\b{digit}\s+(more|additional|other|identical)\s+([a-zA-Z]+s)\b'
710
+ processed_response = re.sub(pattern2, rf'{word} \1 \2', processed_response, flags=re.IGNORECASE)
711
+
712
+ # 模式3: 數字 + 形容詞 + 複數詞 (如 "2 dining tables")
713
+ pattern3 = rf'\b{digit}\s+([a-zA-Z]+)\s+([a-zA-Z]+s)\b'
714
+ processed_response = re.sub(pattern3, rf'{word} \1 \2', processed_response)
715
+
716
+ # 模式4: 介詞片語中的數字 (如 "around 2 tables")
717
+ pattern4 = rf'\b(around|approximately|about)\s+{digit}\s+([a-zA-Z]+s)\b'
718
+ processed_response = re.sub(pattern4, rf'\1 {word} \2', processed_response, flags=re.IGNORECASE)
719
+
720
  return processed_response
721
+
722
  except Exception as e:
723
  self.logger.error(f"Error in _handle_repetitive_vocabulary: {str(e)}")
724
  self.logger.error(traceback.format_exc())
725
+ return response # 發生錯誤時返回原始回應
726
 
727
  def _ensure_grammatical_completeness(self, response: str) -> str:
728
  """
 
989
  return response.strip()
990
 
991
  def _control_word_length(self, response: str) -> str:
992
+ """控制文字長度在合理範圍內"""
993
  words = response.split()
994
+ if len(words) > 200:
995
+ # 找到接近字數限制的句子結束處
996
+ truncated = ' '.join(words[:200])
997
+ last_period = max(truncated.rfind('.'), truncated.rfind('!'), truncated.rfind('?'))
998
+
999
+ if last_period > 0:
1000
+ response = truncated[:last_period+1]
1001
+ else:
1002
+ response = truncated + "."
1003
+
1004
+ return response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1005
 
1006
  def _final_formatting(self, response: str) -> str:
1007
  """最終格式化處理"""