Spaces:

DawnC
/

VisionScout

Running on Zero

App Files Files Community

DawnC committed 2 days ago

Commit

a43ff7a

verified ·

1 Parent(s): ea980d5

Update response_processor.py

Browse files

Files changed (1) hide show

response_processor.py +158 -70

response_processor.py CHANGED Viewed

@@ -669,50 +669,88 @@ class ResponseProcessor:
             return response # 發生錯誤時返回原始回應
     def _handle_repetitive_vocabulary(self, response: str) -> str:
-        """Handle repeated vocabulary, using re.sub with a callable replacement function for better efficiency and accuracy."""
         try:
-            # Detect repetitive patterns (warning only)
             if hasattr(self, 'repetitive_patterns'):
                 for pattern, issue in self.repetitive_patterns:
-                    if re.search(pattern, response, re.IGNORECASE | re.DOTALL):
                         self.logger.warning(f"Text quality issue detected: {issue} in response: \"{response[:100]}...\"")
             if not hasattr(self, 'replacement_alternatives') or not self.replacement_alternatives:
                 return response
             processed_response = response
             for word_to_replace, alternatives in self.replacement_alternatives.items():
-                if not alternatives:  # skip when no alternative words are available
                     continue
-                # Create an independent counter and alternative index for each word
-                # Use a closure or a small class to encapsulate that state
-                class WordReplacer:
-                    def __init__(self, alternatives_list):
-                        self.count = 0
-                        self.alternative_idx = 0
-                        self.alternatives_list = alternatives_list
-                    def __call__(self, match_obj):
-                        self.count += 1
-                        original_word = match_obj.group(0)
-                        if self.count > 1:  # replace starting from the second occurrence
-                            replacement = self.alternatives_list[self.alternative_idx % len(self.alternatives_list)]
-                            self.alternative_idx += 1
-                            # Preserve the original capitalization
-                            if original_word.isupper():
-                                return replacement.upper()
-                            elif original_word.istitle():
-                                return replacement.capitalize()
-                            return replacement
-                        return original_word # first occurrence, so no replacement is needed
-                replacer_instance = WordReplacer(alternatives)
-                # Use \b so that only whole words are matched
-                pattern = re.compile(r'\b' + re.escape(word_to_replace) + r'\b', re.IGNORECASE)
-                processed_response = pattern.sub(replacer_instance, processed_response)
             # Remove redundant descriptive wording such as "identical"
             identical_cleanup_patterns = [
                 (r'\b(\d+)\s+identical\s+([a-zA-Z\s]+)', r'\1 \2'),
@@ -722,41 +760,35 @@ class ResponseProcessor:
                 (r'\bcomprehensive view featuring\b', 'scene featuring'),
                 (r'\bcomprehensive display of\b', 'display of'),
             ]
             for pattern, replacement in identical_cleanup_patterns:
                 processed_response = re.sub(pattern, replacement, processed_response, flags=re.IGNORECASE)
-            # Digits to words
             number_conversions = {
                 '2': 'two', '3': 'three', '4': 'four', '5': 'five', '6': 'six',
                 '7': 'seven', '8': 'eight', '9': 'nine', '10': 'ten',
                 '11': 'eleven', '12': 'twelve'
             }
-            # Handle digits in various grammatical structures
             for digit, word in number_conversions.items():
-                # Pattern 1: digit + single plural word (e.g. "7 chairs")
-                pattern1 = rf'\b{digit}\s+([a-zA-Z]+s)\b'
-                processed_response = re.sub(pattern1, rf'{word} \1', processed_response)
-                # Pattern 2: digit + modifier + plural word (e.g. "7 more chairs")
-                pattern2 = rf'\b{digit}\s+(more|additional|other|identical)\s+([a-zA-Z]+s)\b'
-                processed_response = re.sub(pattern2, rf'{word} \1 \2', processed_response, flags=re.IGNORECASE)
-                # Pattern 3: digit + adjective + plural word (e.g. "2 dining tables")
-                pattern3 = rf'\b{digit}\s+([a-zA-Z]+)\s+([a-zA-Z]+s)\b'
-                processed_response = re.sub(pattern3, rf'{word} \1 \2', processed_response)
-                # Pattern 4: digit inside a prepositional phrase (e.g. "around 2 tables")
-                pattern4 = rf'\b(around|approximately|about)\s+{digit}\s+([a-zA-Z]+s)\b'
-                processed_response = re.sub(pattern4, rf'\1 {word} \2', processed_response, flags=re.IGNORECASE)
             return processed_response
         except Exception as e:
             self.logger.error(f"Error in _handle_repetitive_vocabulary: {str(e)}")
             self.logger.error(traceback.format_exc())
-            return response # return the original response when an error occurs
     def _ensure_grammatical_completeness(self, response: str) -> str:
         """
@@ -1023,19 +1055,75 @@ class ResponseProcessor:
         return response.strip()
     def _control_word_length(self, response: str) -> str:
-        """Keep the text length within a reasonable range."""
         words = response.split()
-        if len(words) > 200:
-            # Find the sentence ending closest to the word limit
-            truncated = ' '.join(words[:200])
-            last_period = max(truncated.rfind('.'), truncated.rfind('!'), truncated.rfind('?'))
-            if last_period > 0:
-                response = truncated[:last_period+1]
-            else:
-                response = truncated + "."
-        return response
     def _final_formatting(self, response: str) -> str:
         """最終格式化處理"""

             return response # 發生錯誤時返回原始回應
     def _handle_repetitive_vocabulary(self, response: str) -> str:
+        """Handle repeated vocabulary using improved detection and replacement; returns the original response on error."""
         try:
+            # First detect repetitive patterns (logged as warnings only, not handled directly)
             if hasattr(self, 'repetitive_patterns'):
                 for pattern, issue in self.repetitive_patterns:
+                    matches = list(re.finditer(pattern, response, re.IGNORECASE | re.DOTALL))
+                    if matches:
                         self.logger.warning(f"Text quality issue detected: {issue} in response: \"{response[:100]}...\"")
             if not hasattr(self, 'replacement_alternatives') or not self.replacement_alternatives:
                 return response
             processed_response = response
+            # Strengthened handling of repeated vocabulary
             for word_to_replace, alternatives in self.replacement_alternatives.items():
+                if not alternatives:
                     continue
+                # Build a whole-word, case-insensitive matching pattern for this word
+                word_pattern = re.compile(r'\b' + re.escape(word_to_replace) + r'\b', re.IGNORECASE)
+                matches = list(word_pattern.finditer(processed_response))
+                if len(matches) <= 1:
+                    continue  # skip when the word appears once or not at all
+                # For words that occur multiple times, perform smart replacement
+                replacement_count = 0
+                alternative_index = 0
+                def smart_replacer(match_obj):
+                    nonlocal replacement_count, alternative_index
+                    replacement_count += 1
+                    original_word = match_obj.group(0)
+                    # Keep the first occurrence as-is; replace the subsequent ones
+                    if replacement_count == 1:
+                        return original_word
+                    # Pick the next alternative, cycling through the list
+                    replacement = alternatives[alternative_index % len(alternatives)]
+                    alternative_index += 1
+                    # Preserve the original capitalization
+                    if original_word.isupper():
+                        return replacement.upper()
+                    elif original_word.istitle():
+                        return replacement.capitalize()
+                    return replacement
+                processed_response = word_pattern.sub(smart_replacer, processed_response)
+            # === Added: special-case logic dedicated to "positioned" ===
+            # "positioned" is frequently problematic, so it gets dedicated handling
+            positioned_pattern = r'\b(positioned)\b'
+            positioned_matches = re.findall(positioned_pattern, processed_response, re.IGNORECASE)
+            if len(positioned_matches) > 1:
+                # Replace every "positioned" except the first one
+                positioned_alternatives = ['arranged', 'placed', 'set', 'located', 'situated']
+                replacement_counter = 0
+                def positioned_replacer(match):
+                    nonlocal replacement_counter
+                    if replacement_counter == 0:
+                        replacement_counter += 1
+                        return match.group(0)  # leave the first occurrence unchanged
+                    else:
+                        alt_index = (replacement_counter - 1) % len(positioned_alternatives)
+                        replacement_counter += 1
+                        original = match.group(0)
+                        new_word = positioned_alternatives[alt_index]
+                        # Preserve the original capitalization
+                        if original.isupper():
+                            return new_word.upper()
+                        elif original.istitle():
+                            return new_word.capitalize()
+                        return new_word
+                processed_response = re.sub(positioned_pattern, positioned_replacer, processed_response, flags=re.IGNORECASE)
             # Remove redundant descriptive wording such as "identical"
             identical_cleanup_patterns = [
                 (r'\b(\d+)\s+identical\s+([a-zA-Z\s]+)', r'\1 \2'),
                 (r'\bcomprehensive view featuring\b', 'scene featuring'),
                 (r'\bcomprehensive display of\b', 'display of'),
             ]
             for pattern, replacement in identical_cleanup_patterns:
                 processed_response = re.sub(pattern, replacement, processed_response, flags=re.IGNORECASE)
+            # Digit-to-word conversion (author's note: keeps the original logic)
             number_conversions = {
                 '2': 'two', '3': 'three', '4': 'four', '5': 'five', '6': 'six',
                 '7': 'seven', '8': 'eight', '9': 'nine', '10': 'ten',
                 '11': 'eleven', '12': 'twelve'
             }
             for digit, word in number_conversions.items():
+                # Handle the various digit patterns; each one is applied case-insensitively below
+                patterns_to_fix = [
+                    (rf'\b{digit}\s+([a-zA-Z]+s)\b', rf'{word} \1'),
+                    (rf'\b{digit}\s+(more|additional|other|identical)\s+([a-zA-Z]+s)\b', rf'{word} \1 \2'),
+                    (rf'\b{digit}\s+([a-zA-Z]+)\s+([a-zA-Z]+s)\b', rf'{word} \1 \2'),
+                    (rf'\b(around|approximately|about)\s+{digit}\s+([a-zA-Z]+s)\b', rf'\1 {word} \2'),
+                ]
+                for pattern, replacement in patterns_to_fix:
+                    processed_response = re.sub(pattern, replacement, processed_response, flags=re.IGNORECASE)
             return processed_response
         except Exception as e:
             self.logger.error(f"Error in _handle_repetitive_vocabulary: {str(e)}")
             self.logger.error(traceback.format_exc())
+            return response
     def _ensure_grammatical_completeness(self, response: str) -> str:
         """
         return response.strip()
     def _control_word_length(self, response: str) -> str:
+        """Keep the text length within a reasonable range while preserving sentence completeness."""
         words = response.split()
+        # Raised base limit to allow more flexibility
+        base_limit = 220
+        extended_limit = 250
+        if len(words) <= base_limit:
+            return response
+        # First try to find a complete sentence within the base limit
+        truncated = ' '.join(words[:base_limit])
+        last_period = max(truncated.rfind('.'), truncated.rfind('!'), truncated.rfind('?'))
+        # A suitable sentence ending was found within the base limit
+        if last_period > len(truncated) * 0.8:  # ensure the cut point is not too early
+            result = truncated[:last_period+1]
+            self.logger.info(f"Text truncated at {base_limit} words with proper sentence ending")
+            return result
+        # No suitable ending within the base limit: widen the search range
+        if len(words) > extended_limit:
+            extended_truncated = ' '.join(words[:extended_limit])
+            extended_last_period = max(
+                extended_truncated.rfind('.'),
+                extended_truncated.rfind('!'),
+                extended_truncated.rfind('?')
+            )
+            # A suitable ending was found within the extended range
+            if extended_last_period > len(extended_truncated) * 0.7:
+                result = extended_truncated[:extended_last_period+1]
+                self.logger.info(f"Text truncated at extended limit with proper sentence ending")
+                return result
+        # Still no suitable ending: fall back to smart truncation
+        # Look for the last complete sentence or clause
+        final_truncated = ' '.join(words[:base_limit])
+        # Look for a possible clause ending (last comma or semicolon)
+        last_comma = final_truncated.rfind(',')
+        last_semicolon = final_truncated.rfind(';')
+        # Choose the best cut point
+        best_cutoff = max(last_period, last_comma, last_semicolon)
+        if best_cutoff > len(final_truncated) * 0.6:
+            # If it ends with a comma or semicolon, change it to a period. NOTE(review): the slice below excludes the character at best_cutoff, so the endswith checks inspect the preceding character - confirm intent
+            result = final_truncated[:best_cutoff]
+            if result.endswith(',') or result.endswith(';'):
+                result = result[:-1] + '.'
+            elif not result.endswith(('.', '!', '?')):
+                result += '.'
+            self.logger.warning(f"Text truncated with intelligent cutoff at position {best_cutoff}")
+            return result
+        # Drop the possibly incomplete final sentence
+        # Find the end of the second-to-last sentence. NOTE(review): last_period can be -1 here, making the rfind end bound exclude only the final character - confirm intent
+        second_last_period = final_truncated.rfind('.', 0, last_period)
+        if second_last_period > 0:
+            result = final_truncated[:second_last_period+1]
+            self.logger.warning("Text truncated by removing incomplete final sentence")
+            return result
+        # If every approach fails, append a period as the ending
+        result = final_truncated.rstrip() + "."
+        self.logger.warning("Text truncated with forced period ending")
+        return result
     def _final_formatting(self, response: str) -> str:
         """最終格式化處理"""