Spaces:
Running
on
Zero
Running
on
Zero
Update response_processor.py
Browse files- response_processor.py +158 -70
response_processor.py
CHANGED
@@ -669,50 +669,88 @@ class ResponseProcessor:
|
|
669 |
return response # 發生錯誤時返回原始回應
|
670 |
|
671 |
def _handle_repetitive_vocabulary(self, response: str) -> str:
|
672 |
-
"""
|
673 |
try:
|
674 |
-
#
|
675 |
if hasattr(self, 'repetitive_patterns'):
|
676 |
for pattern, issue in self.repetitive_patterns:
|
677 |
-
|
|
|
678 |
self.logger.warning(f"Text quality issue detected: {issue} in response: \"{response[:100]}...\"")
|
679 |
-
|
680 |
if not hasattr(self, 'replacement_alternatives') or not self.replacement_alternatives:
|
681 |
return response
|
682 |
-
|
683 |
processed_response = response
|
684 |
-
|
|
|
685 |
for word_to_replace, alternatives in self.replacement_alternatives.items():
|
686 |
-
if not alternatives:
|
687 |
continue
|
688 |
-
|
689 |
-
#
|
690 |
-
|
691 |
-
|
692 |
-
|
693 |
-
|
694 |
-
|
695 |
-
|
696 |
-
|
697 |
-
|
698 |
-
|
699 |
-
|
700 |
-
|
701 |
-
|
702 |
-
|
703 |
-
|
704 |
-
|
705 |
-
|
706 |
-
|
707 |
-
|
708 |
-
|
709 |
-
|
710 |
-
|
711 |
-
|
712 |
-
|
713 |
-
|
714 |
-
|
715 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
716 |
# 移除 identical 等重複性描述詞彙
|
717 |
identical_cleanup_patterns = [
|
718 |
(r'\b(\d+)\s+identical\s+([a-zA-Z\s]+)', r'\1 \2'),
|
@@ -722,41 +760,35 @@ class ResponseProcessor:
|
|
722 |
(r'\bcomprehensive view featuring\b', 'scene featuring'),
|
723 |
(r'\bcomprehensive display of\b', 'display of'),
|
724 |
]
|
725 |
-
|
726 |
for pattern, replacement in identical_cleanup_patterns:
|
727 |
processed_response = re.sub(pattern, replacement, processed_response, flags=re.IGNORECASE)
|
728 |
-
|
729 |
-
#
|
730 |
number_conversions = {
|
731 |
'2': 'two', '3': 'three', '4': 'four', '5': 'five', '6': 'six',
|
732 |
'7': 'seven', '8': 'eight', '9': 'nine', '10': 'ten',
|
733 |
'11': 'eleven', '12': 'twelve'
|
734 |
}
|
735 |
-
|
736 |
-
# 處理各種語法結構中的數字
|
737 |
for digit, word in number_conversions.items():
|
738 |
-
#
|
739 |
-
|
740 |
-
|
741 |
-
|
742 |
-
|
743 |
-
|
744 |
-
|
745 |
-
|
746 |
-
|
747 |
-
|
748 |
-
|
749 |
-
|
750 |
-
# 模式4: 介詞片語中的數字 (如 "around 2 tables")
|
751 |
-
pattern4 = rf'\b(around|approximately|about)\s+{digit}\s+([a-zA-Z]+s)\b'
|
752 |
-
processed_response = re.sub(pattern4, rf'\1 {word} \2', processed_response, flags=re.IGNORECASE)
|
753 |
-
|
754 |
return processed_response
|
755 |
-
|
756 |
except Exception as e:
|
757 |
self.logger.error(f"Error in _handle_repetitive_vocabulary: {str(e)}")
|
758 |
self.logger.error(traceback.format_exc())
|
759 |
-
return response
|
760 |
|
761 |
def _ensure_grammatical_completeness(self, response: str) -> str:
|
762 |
"""
|
@@ -1023,19 +1055,75 @@ class ResponseProcessor:
|
|
1023 |
return response.strip()
|
1024 |
|
1025 |
def _control_word_length(self, response: str) -> str:
|
1026 |
-
"""
|
1027 |
words = response.split()
|
1028 |
-
|
1029 |
-
|
1030 |
-
|
1031 |
-
|
1032 |
-
|
1033 |
-
|
1034 |
-
|
1035 |
-
|
1036 |
-
|
1037 |
-
|
1038 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1039 |
|
1040 |
def _final_formatting(self, response: str) -> str:
|
1041 |
"""最終格式化處理"""
|
|
|
669 |
return response # 發生錯誤時返回原始回應
|
670 |
|
671 |
def _handle_repetitive_vocabulary(self, response: str) -> str:
|
672 |
+
"""處理重複詞彙,使用改進的檢測和替換機制"""
|
673 |
try:
|
674 |
+
# 先進行重複模式檢測(記錄但不直接處理)
|
675 |
if hasattr(self, 'repetitive_patterns'):
|
676 |
for pattern, issue in self.repetitive_patterns:
|
677 |
+
matches = list(re.finditer(pattern, response, re.IGNORECASE | re.DOTALL))
|
678 |
+
if matches:
|
679 |
self.logger.warning(f"Text quality issue detected: {issue} in response: \"{response[:100]}...\"")
|
680 |
+
|
681 |
if not hasattr(self, 'replacement_alternatives') or not self.replacement_alternatives:
|
682 |
return response
|
683 |
+
|
684 |
processed_response = response
|
685 |
+
|
686 |
+
# 強化的重複詞彙處理
|
687 |
for word_to_replace, alternatives in self.replacement_alternatives.items():
|
688 |
+
if not alternatives:
|
689 |
continue
|
690 |
+
|
691 |
+
# 創建更精確的詞彙匹配模式
|
692 |
+
word_pattern = re.compile(r'\b' + re.escape(word_to_replace) + r'\b', re.IGNORECASE)
|
693 |
+
matches = list(word_pattern.finditer(processed_response))
|
694 |
+
|
695 |
+
if len(matches) <= 1:
|
696 |
+
continue # 如果只出現一次或沒有出現,跳過
|
697 |
+
|
698 |
+
# 對於多次出現的情況,進行智能替換
|
699 |
+
replacement_count = 0
|
700 |
+
alternative_index = 0
|
701 |
+
|
702 |
+
def smart_replacer(match_obj):
|
703 |
+
nonlocal replacement_count, alternative_index
|
704 |
+
replacement_count += 1
|
705 |
+
original_word = match_obj.group(0)
|
706 |
+
|
707 |
+
# 第一次出現保持原樣,後續出現進行替換
|
708 |
+
if replacement_count == 1:
|
709 |
+
return original_word
|
710 |
+
|
711 |
+
# 選擇適當的替代詞
|
712 |
+
replacement = alternatives[alternative_index % len(alternatives)]
|
713 |
+
alternative_index += 1
|
714 |
+
|
715 |
+
# 保持原始大小寫格式
|
716 |
+
if original_word.isupper():
|
717 |
+
return replacement.upper()
|
718 |
+
elif original_word.istitle():
|
719 |
+
return replacement.capitalize()
|
720 |
+
return replacement
|
721 |
+
|
722 |
+
processed_response = word_pattern.sub(smart_replacer, processed_response)
|
723 |
+
|
724 |
+
# === 新增:專門處理 "positioned" 的特殊邏輯 ===
|
725 |
+
# 由於 "positioned" 經常出現問題,給予特別處理
|
726 |
+
positioned_pattern = r'\b(positioned)\b'
|
727 |
+
positioned_matches = re.findall(positioned_pattern, processed_response, re.IGNORECASE)
|
728 |
+
|
729 |
+
if len(positioned_matches) > 1:
|
730 |
+
# 替換除了第一個以外的所有 "positioned"
|
731 |
+
positioned_alternatives = ['arranged', 'placed', 'set', 'located', 'situated']
|
732 |
+
replacement_counter = 0
|
733 |
+
|
734 |
+
def positioned_replacer(match):
|
735 |
+
nonlocal replacement_counter
|
736 |
+
if replacement_counter == 0:
|
737 |
+
replacement_counter += 1
|
738 |
+
return match.group(0) # 保持第一個不變
|
739 |
+
else:
|
740 |
+
alt_index = (replacement_counter - 1) % len(positioned_alternatives)
|
741 |
+
replacement_counter += 1
|
742 |
+
original = match.group(0)
|
743 |
+
new_word = positioned_alternatives[alt_index]
|
744 |
+
|
745 |
+
# 保持大小寫格式
|
746 |
+
if original.isupper():
|
747 |
+
return new_word.upper()
|
748 |
+
elif original.istitle():
|
749 |
+
return new_word.capitalize()
|
750 |
+
return new_word
|
751 |
+
|
752 |
+
processed_response = re.sub(positioned_pattern, positioned_replacer, processed_response, flags=re.IGNORECASE)
|
753 |
+
|
754 |
# 移除 identical 等重複性描述詞彙
|
755 |
identical_cleanup_patterns = [
|
756 |
(r'\b(\d+)\s+identical\s+([a-zA-Z\s]+)', r'\1 \2'),
|
|
|
760 |
(r'\bcomprehensive view featuring\b', 'scene featuring'),
|
761 |
(r'\bcomprehensive display of\b', 'display of'),
|
762 |
]
|
763 |
+
|
764 |
for pattern, replacement in identical_cleanup_patterns:
|
765 |
processed_response = re.sub(pattern, replacement, processed_response, flags=re.IGNORECASE)
|
766 |
+
|
767 |
+
# 數字到文字轉換(保持原有邏輯)
|
768 |
number_conversions = {
|
769 |
'2': 'two', '3': 'three', '4': 'four', '5': 'five', '6': 'six',
|
770 |
'7': 'seven', '8': 'eight', '9': 'nine', '10': 'ten',
|
771 |
'11': 'eleven', '12': 'twelve'
|
772 |
}
|
773 |
+
|
|
|
774 |
for digit, word in number_conversions.items():
|
775 |
+
# 各種數字模式的處理
|
776 |
+
patterns_to_fix = [
|
777 |
+
(rf'\b{digit}\s+([a-zA-Z]+s)\b', rf'{word} \1'),
|
778 |
+
(rf'\b{digit}\s+(more|additional|other|identical)\s+([a-zA-Z]+s)\b', rf'{word} \1 \2'),
|
779 |
+
(rf'\b{digit}\s+([a-zA-Z]+)\s+([a-zA-Z]+s)\b', rf'{word} \1 \2'),
|
780 |
+
(rf'\b(around|approximately|about)\s+{digit}\s+([a-zA-Z]+s)\b', rf'\1 {word} \2'),
|
781 |
+
]
|
782 |
+
|
783 |
+
for pattern, replacement in patterns_to_fix:
|
784 |
+
processed_response = re.sub(pattern, replacement, processed_response, flags=re.IGNORECASE)
|
785 |
+
|
|
|
|
|
|
|
|
|
|
|
786 |
return processed_response
|
787 |
+
|
788 |
except Exception as e:
|
789 |
self.logger.error(f"Error in _handle_repetitive_vocabulary: {str(e)}")
|
790 |
self.logger.error(traceback.format_exc())
|
791 |
+
return response
|
792 |
|
793 |
def _ensure_grammatical_completeness(self, response: str) -> str:
|
794 |
"""
|
|
|
1055 |
return response.strip()
|
1056 |
|
1057 |
def _control_word_length(self, response: str, *, base_limit: int = 220,
                         extended_limit: int = 250) -> str:
    """Keep the response within a reasonable word count, ending on a full sentence.

    The text is split on whitespace. If it has at most ``base_limit`` words it
    is returned untouched. Otherwise the cut point is chosen, in order of
    preference:

    1. the last sentence terminator (``.``, ``!``, ``?``) inside the first
       ``base_limit`` words, if it lies in the final 20% of that window;
    2. the last sentence terminator inside the first ``extended_limit``
       words, if it lies in the final 30% of that window;
    3. the last terminator, comma or semicolon inside the base window, if it
       lies in the final 40% of it (a clause cut gets a closing period);
    4. the last sentence terminator anywhere in the base window;
    5. the whole base window with a period appended.

    Truncated results are rebuilt with single spaces between words, so the
    original whitespace/newlines are only preserved when nothing is cut.

    Args:
        response: Text to limit. Empty or ``None`` input is returned as-is.
        base_limit: Preferred maximum number of words.
        extended_limit: Hard upper bound on words used when searching for a
            clean sentence ending beyond ``base_limit``.

    Returns:
        The original text, or a truncated version ending in ``.``, ``!`` or ``?``.
    """
    if not response:
        return response

    words = response.split()
    if len(words) <= base_limit:
        return response

    def last_sentence_end(text: str) -> int:
        # Index of the right-most sentence terminator, or -1 if there is none.
        return max(text.rfind(mark) for mark in ".!?")

    truncated = " ".join(words[:base_limit])
    last_period = last_sentence_end(truncated)

    # A sentence ending close to the base limit: cut there.
    if last_period > len(truncated) * 0.8:
        self.logger.info(
            "Text truncated at %s words with proper sentence ending", base_limit
        )
        return truncated[:last_period + 1]

    # Widen the window. This is done even when the text is shorter than
    # extended_limit, so that a response of e.g. 230 words which already ends
    # on a sentence is kept instead of being cut back to the base limit.
    extended_truncated = " ".join(words[:max(extended_limit, base_limit)])
    extended_last_period = last_sentence_end(extended_truncated)
    if extended_last_period > len(extended_truncated) * 0.7:
        self.logger.info("Text truncated at extended limit with proper sentence ending")
        return extended_truncated[:extended_last_period + 1]

    # No good sentence ending: accept a clause boundary inside the base window.
    best_cutoff = max(last_period, truncated.rfind(","), truncated.rfind(";"))
    if best_cutoff > len(truncated) * 0.6:
        if best_cutoff == last_period:
            # Keep the terminator itself so "!" / "?" are not rewritten to ".".
            result = truncated[:best_cutoff + 1]
        else:
            # Drop the comma/semicolon and close the clause with a period.
            result = truncated[:best_cutoff].rstrip(" ,;")
            if not result.endswith((".", "!", "?")):
                result += "."
        self.logger.warning(
            "Text truncated with intelligent cutoff at position %s", best_cutoff
        )
        return result

    # Everything after the last terminator is an incomplete sentence; drop
    # only that fragment rather than an additional complete sentence.
    if last_period > 0:
        self.logger.warning("Text truncated by removing incomplete final sentence")
        return truncated[:last_period + 1]

    # No usable punctuation at all: force a period onto the base window.
    self.logger.warning("Text truncated with forced period ending")
    return truncated.rstrip() + "."
|
1127 |
|
1128 |
def _final_formatting(self, response: str) -> str:
|
1129 |
"""最終格式化處理"""
|