DawnC committed on
Commit
a43ff7a
·
verified ·
1 Parent(s): ea980d5

Update response_processor.py

Browse files
Files changed (1) hide show
  1. response_processor.py +158 -70
response_processor.py CHANGED
@@ -669,50 +669,88 @@ class ResponseProcessor:
669
  return response # 發生錯誤時返回原始回應
670
 
671
  def _handle_repetitive_vocabulary(self, response: str) -> str:
672
- """處理重複詞彙,使用 re.sub 和可呼叫的替換函數以提高效率和準確性。"""
673
  try:
674
- # 檢測重複模式 (僅警告)
675
  if hasattr(self, 'repetitive_patterns'):
676
  for pattern, issue in self.repetitive_patterns:
677
- if re.search(pattern, response, re.IGNORECASE | re.DOTALL):
 
678
  self.logger.warning(f"Text quality issue detected: {issue} in response: \"{response[:100]}...\"")
679
-
680
  if not hasattr(self, 'replacement_alternatives') or not self.replacement_alternatives:
681
  return response
682
-
683
  processed_response = response
684
-
 
685
  for word_to_replace, alternatives in self.replacement_alternatives.items():
686
- if not alternatives: # 如果沒有可用的替代詞,則跳過
687
  continue
688
-
689
- # 為每個詞創建一個獨立的計數器和替代索引
690
- # 使用閉包或一個小類來封裝狀態
691
- class WordReplacer:
692
- def __init__(self, alternatives_list):
693
- self.count = 0
694
- self.alternative_idx = 0
695
- self.alternatives_list = alternatives_list
696
-
697
- def __call__(self, match_obj):
698
- self.count += 1
699
- original_word = match_obj.group(0)
700
- if self.count > 1: # 從第二次出現開始替換
701
- replacement = self.alternatives_list[self.alternative_idx % len(self.alternatives_list)]
702
- self.alternative_idx += 1
703
- # 保持原始大小寫格式
704
- if original_word.isupper():
705
- return replacement.upper()
706
- elif original_word.istitle():
707
- return replacement.capitalize()
708
- return replacement
709
- return original_word # 因為第一次出現, 就不用替換
710
-
711
- replacer_instance = WordReplacer(alternatives)
712
- # 使用 \b 確保匹配的是整個單詞
713
- pattern = re.compile(r'\b' + re.escape(word_to_replace) + r'\b', re.IGNORECASE)
714
- processed_response = pattern.sub(replacer_instance, processed_response)
715
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
716
  # 移除 identical 等重複性描述詞彙
717
  identical_cleanup_patterns = [
718
  (r'\b(\d+)\s+identical\s+([a-zA-Z\s]+)', r'\1 \2'),
@@ -722,41 +760,35 @@ class ResponseProcessor:
722
  (r'\bcomprehensive view featuring\b', 'scene featuring'),
723
  (r'\bcomprehensive display of\b', 'display of'),
724
  ]
725
-
726
  for pattern, replacement in identical_cleanup_patterns:
727
  processed_response = re.sub(pattern, replacement, processed_response, flags=re.IGNORECASE)
728
-
729
- # 數字到文字
730
  number_conversions = {
731
  '2': 'two', '3': 'three', '4': 'four', '5': 'five', '6': 'six',
732
  '7': 'seven', '8': 'eight', '9': 'nine', '10': 'ten',
733
  '11': 'eleven', '12': 'twelve'
734
  }
735
-
736
- # 處理各種語法結構中的數字
737
  for digit, word in number_conversions.items():
738
- # 模式1: 數字 + 單一複數詞 (如 "7 chairs")
739
- pattern1 = rf'\b{digit}\s+([a-zA-Z]+s)\b'
740
- processed_response = re.sub(pattern1, rf'{word} \1', processed_response)
741
-
742
- # 模式2: 數字 + 修飾詞 + 複數詞 ( "7 more chairs")
743
- pattern2 = rf'\b{digit}\s+(more|additional|other|identical)\s+([a-zA-Z]+s)\b'
744
- processed_response = re.sub(pattern2, rf'{word} \1 \2', processed_response, flags=re.IGNORECASE)
745
-
746
- # 模式3: 數字 + 形容詞 + 複數詞 (如 "2 dining tables")
747
- pattern3 = rf'\b{digit}\s+([a-zA-Z]+)\s+([a-zA-Z]+s)\b'
748
- processed_response = re.sub(pattern3, rf'{word} \1 \2', processed_response)
749
-
750
- # 模式4: 介詞片語中的數字 (如 "around 2 tables")
751
- pattern4 = rf'\b(around|approximately|about)\s+{digit}\s+([a-zA-Z]+s)\b'
752
- processed_response = re.sub(pattern4, rf'\1 {word} \2', processed_response, flags=re.IGNORECASE)
753
-
754
  return processed_response
755
-
756
  except Exception as e:
757
  self.logger.error(f"Error in _handle_repetitive_vocabulary: {str(e)}")
758
  self.logger.error(traceback.format_exc())
759
- return response # 發生錯誤時返回原始回應
760
 
761
  def _ensure_grammatical_completeness(self, response: str) -> str:
762
  """
@@ -1023,19 +1055,75 @@ class ResponseProcessor:
1023
  return response.strip()
1024
 
1025
  def _control_word_length(self, response: str) -> str:
1026
- """控制文字長度在合理範圍內"""
1027
  words = response.split()
1028
- if len(words) > 200:
1029
- # 找到接近字數限制的句子結束處
1030
- truncated = ' '.join(words[:200])
1031
- last_period = max(truncated.rfind('.'), truncated.rfind('!'), truncated.rfind('?'))
1032
-
1033
- if last_period > 0:
1034
- response = truncated[:last_period+1]
1035
- else:
1036
- response = truncated + "."
1037
-
1038
- return response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1039
 
1040
  def _final_formatting(self, response: str) -> str:
1041
  """最終格式化處理"""
 
669
  return response # 發生錯誤時返回原始回應
670
 
671
  def _handle_repetitive_vocabulary(self, response: str) -> str:
672
+ """處理重複詞彙,使用改進的檢測和替換機制"""
673
  try:
674
+ # 先進行重複模式檢測(記錄但不直接處理)
675
  if hasattr(self, 'repetitive_patterns'):
676
  for pattern, issue in self.repetitive_patterns:
677
+ matches = list(re.finditer(pattern, response, re.IGNORECASE | re.DOTALL))
678
+ if matches:
679
  self.logger.warning(f"Text quality issue detected: {issue} in response: \"{response[:100]}...\"")
680
+
681
  if not hasattr(self, 'replacement_alternatives') or not self.replacement_alternatives:
682
  return response
683
+
684
  processed_response = response
685
+
686
+ # 強化的重複詞彙處理
687
  for word_to_replace, alternatives in self.replacement_alternatives.items():
688
+ if not alternatives:
689
  continue
690
+
691
+ # 創建更精確的詞彙匹配模式
692
+ word_pattern = re.compile(r'\b' + re.escape(word_to_replace) + r'\b', re.IGNORECASE)
693
+ matches = list(word_pattern.finditer(processed_response))
694
+
695
+ if len(matches) <= 1:
696
+ continue # 如果只出現一次或沒有出現,跳過
697
+
698
+ # 對於多次出現的情況,進行智能替換
699
+ replacement_count = 0
700
+ alternative_index = 0
701
+
702
+ def smart_replacer(match_obj):
703
+ nonlocal replacement_count, alternative_index
704
+ replacement_count += 1
705
+ original_word = match_obj.group(0)
706
+
707
+ # 第一次出現保持原樣,後續出現進行替換
708
+ if replacement_count == 1:
709
+ return original_word
710
+
711
+ # 選擇適當的替代詞
712
+ replacement = alternatives[alternative_index % len(alternatives)]
713
+ alternative_index += 1
714
+
715
+ # 保持原始大小寫格式
716
+ if original_word.isupper():
717
+ return replacement.upper()
718
+ elif original_word.istitle():
719
+ return replacement.capitalize()
720
+ return replacement
721
+
722
+ processed_response = word_pattern.sub(smart_replacer, processed_response)
723
+
724
+ # === 新增:專門處理 "positioned" 的特殊邏輯 ===
725
+ # 由於 "positioned" 經常出現問題,給予特別處理
726
+ positioned_pattern = r'\b(positioned)\b'
727
+ positioned_matches = re.findall(positioned_pattern, processed_response, re.IGNORECASE)
728
+
729
+ if len(positioned_matches) > 1:
730
+ # 替換除了第一個以外的所有 "positioned"
731
+ positioned_alternatives = ['arranged', 'placed', 'set', 'located', 'situated']
732
+ replacement_counter = 0
733
+
734
+ def positioned_replacer(match):
735
+ nonlocal replacement_counter
736
+ if replacement_counter == 0:
737
+ replacement_counter += 1
738
+ return match.group(0) # 保持第一個不變
739
+ else:
740
+ alt_index = (replacement_counter - 1) % len(positioned_alternatives)
741
+ replacement_counter += 1
742
+ original = match.group(0)
743
+ new_word = positioned_alternatives[alt_index]
744
+
745
+ # 保持大小寫格式
746
+ if original.isupper():
747
+ return new_word.upper()
748
+ elif original.istitle():
749
+ return new_word.capitalize()
750
+ return new_word
751
+
752
+ processed_response = re.sub(positioned_pattern, positioned_replacer, processed_response, flags=re.IGNORECASE)
753
+
754
  # 移除 identical 等重複性描述詞彙
755
  identical_cleanup_patterns = [
756
  (r'\b(\d+)\s+identical\s+([a-zA-Z\s]+)', r'\1 \2'),
 
760
  (r'\bcomprehensive view featuring\b', 'scene featuring'),
761
  (r'\bcomprehensive display of\b', 'display of'),
762
  ]
763
+
764
  for pattern, replacement in identical_cleanup_patterns:
765
  processed_response = re.sub(pattern, replacement, processed_response, flags=re.IGNORECASE)
766
+
767
+ # 數字到文字轉換(保持原有邏輯)
768
  number_conversions = {
769
  '2': 'two', '3': 'three', '4': 'four', '5': 'five', '6': 'six',
770
  '7': 'seven', '8': 'eight', '9': 'nine', '10': 'ten',
771
  '11': 'eleven', '12': 'twelve'
772
  }
773
+
 
774
  for digit, word in number_conversions.items():
775
+ # 各種數字模式的處理
776
+ patterns_to_fix = [
777
+ (rf'\b{digit}\s+([a-zA-Z]+s)\b', rf'{word} \1'),
778
+ (rf'\b{digit}\s+(more|additional|other|identical)\s+([a-zA-Z]+s)\b', rf'{word} \1 \2'),
779
+ (rf'\b{digit}\s+([a-zA-Z]+)\s+([a-zA-Z]+s)\b', rf'{word} \1 \2'),
780
+ (rf'\b(around|approximately|about)\s+{digit}\s+([a-zA-Z]+s)\b', rf'\1 {word} \2'),
781
+ ]
782
+
783
+ for pattern, replacement in patterns_to_fix:
784
+ processed_response = re.sub(pattern, replacement, processed_response, flags=re.IGNORECASE)
785
+
 
 
 
 
 
786
  return processed_response
787
+
788
  except Exception as e:
789
  self.logger.error(f"Error in _handle_repetitive_vocabulary: {str(e)}")
790
  self.logger.error(traceback.format_exc())
791
+ return response
792
 
793
  def _ensure_grammatical_completeness(self, response: str) -> str:
794
  """
 
1055
  return response.strip()
1056
 
1057
  def _control_word_length(self, response: str) -> str:
1058
+ """控制文字長度在合理範圍內,確保句子完整性"""
1059
  words = response.split()
1060
+
1061
+ # 提高基礎限制,給予更多彈性
1062
+ base_limit = 220
1063
+ extended_limit = 250
1064
+
1065
+ if len(words) <= base_limit:
1066
+ return response
1067
+
1068
+ # 首先嘗試在基礎限制內找到完整句子
1069
+ truncated = ' '.join(words[:base_limit])
1070
+ last_period = max(truncated.rfind('.'), truncated.rfind('!'), truncated.rfind('?'))
1071
+
1072
+ # 如果在基礎限制內找到了適當的句子結尾
1073
+ if last_period > len(truncated) * 0.8: # 確保截斷點不會太早
1074
+ result = truncated[:last_period+1]
1075
+ self.logger.info(f"Text truncated at {base_limit} words with proper sentence ending")
1076
+ return result
1077
+
1078
+ # 如果基礎限制內沒有找到合適結尾,擴展搜尋範圍
1079
+ if len(words) > extended_limit:
1080
+ extended_truncated = ' '.join(words[:extended_limit])
1081
+ extended_last_period = max(
1082
+ extended_truncated.rfind('.'),
1083
+ extended_truncated.rfind('!'),
1084
+ extended_truncated.rfind('?')
1085
+ )
1086
+
1087
+ # 在擴展範圍內找到合適的結尾
1088
+ if extended_last_period > len(extended_truncated) * 0.7:
1089
+ result = extended_truncated[:extended_last_period+1]
1090
+ self.logger.info(f"Text truncated at extended limit with proper sentence ending")
1091
+ return result
1092
+
1093
+ # 如果仍然找不到合適的結尾,使用智能截斷
1094
+ # 尋找最後一個完整的句子或子句
1095
+ final_truncated = ' '.join(words[:base_limit])
1096
+
1097
+ # 尋找可能的子句結尾(逗號後的位置)
1098
+ last_comma = final_truncated.rfind(',')
1099
+ last_semicolon = final_truncated.rfind(';')
1100
+
1101
+ # 選擇最佳截斷點
1102
+ best_cutoff = max(last_period, last_comma, last_semicolon)
1103
+
1104
+ if best_cutoff > len(final_truncated) * 0.6:
1105
+ # 如果是逗號或分號結尾,改為句號
1106
+ result = final_truncated[:best_cutoff]
1107
+ if result.endswith(',') or result.endswith(';'):
1108
+ result = result[:-1] + '.'
1109
+ elif not result.endswith(('.', '!', '?')):
1110
+ result += '.'
1111
+
1112
+ self.logger.warning(f"Text truncated with intelligent cutoff at position {best_cutoff}")
1113
+ return result
1114
+
1115
+ # 移除可能不完整的最後一個句子
1116
+ # 找到倒數第二個句子的結尾
1117
+ second_last_period = final_truncated.rfind('.', 0, last_period)
1118
+ if second_last_period > 0:
1119
+ result = final_truncated[:second_last_period+1]
1120
+ self.logger.warning("Text truncated by removing incomplete final sentence")
1121
+ return result
1122
+
1123
+ # 如果所有方法都失敗,添加適合的結尾
1124
+ result = final_truncated.rstrip() + "."
1125
+ self.logger.warning("Text truncated with forced period ending")
1126
+ return result
1127
 
1128
  def _final_formatting(self, response: str) -> str:
1129
  """最終格式化處理"""