Spaces:

DawnC
/

VisionScout

Running on Zero

App Files Files Community

DawnC committed 1 day ago

Commit

5895031

verified ·

1 Parent(s): c42671c

Update response_processor.py

Browse files

Files changed (1) hide show

response_processor.py +164 -34

response_processor.py CHANGED Viewed

@@ -1218,55 +1218,185 @@ class ResponseProcessor:
         if not re.search(r'[.!?]', response):
             raise ResponseProcessingError("Response lacks proper sentence structure")
-    def remove_explanatory_notes(self, response: str) -> str:
         """
-        移除解釋性注釋和說明
         Args:
-            response: 包含可能注釋的回應
         Returns:
-            str: 移除注釋後的回應
         """
         try:
-            # 識別常見的注釋和解釋模式
             note_patterns = [
-                r'(?:^|\n)Note:.*?(?:\n|$)',
-                r'(?:^|\n)I have (?:followed|adhered to|ensured).*?(?:\n|$)',
-                r'(?:^|\n)This description (?:follows|adheres to|maintains).*?(?:\n|$)',
-                r'(?:^|\n)The enhanced description (?:maintains|preserves).*?(?:\n|$)'
             ]
-            # 尋找段落
             paragraphs = [p.strip() for p in response.split('\n\n') if p.strip()]
-            # 如果只有一個段落，檢查並清理它
-            if len(paragraphs) == 1:
                 for pattern in note_patterns:
-                    paragraphs[0] = re.sub(pattern, '', paragraphs[0], flags=re.IGNORECASE)
-                return paragraphs[0].strip()
-            # 如果有多個段落，移除注釋段落
             content_paragraphs = []
             for paragraph in paragraphs:
-                is_note = False
                 for pattern in note_patterns:
-                    if re.search(pattern, paragraph, flags=re.IGNORECASE):
-                        is_note = True
                         break
-                # 檢查段落是否以常見的注釋詞開頭
-                if paragraph.lower().startswith(('note:', 'please note:', 'remember:')):
-                    is_note = True
-                if not is_note:
                     content_paragraphs.append(paragraph)
-            return '\n\n'.join(content_paragraphs).strip()
         except Exception as e:
-            self.logger.error(f"Failed to remove explanatory notes: {str(e)}")
-            return response
     def get_processor_info(self) -> Dict[str, Any]:
         """

         if not re.search(r'[.!?]', response):
             raise ResponseProcessingError("Response lacks proper sentence structure")
+    def remove_explanatory_notes(response: str, debug: bool = False) -> str:
         """
+        移除 LLM 回應中的解釋性注釋和內部處理文字
         Args:
+            response: 包含可能注釋的回應文字
+            debug: 是否顯示除錯資訊
         Returns:
+            str: 移除注釋後的清理文字
         """
+        if not response or not response.strip():
+            return response
+        original_response = response
         try:
+            # 階段1：移除明確的注釋段落模式
             note_patterns = [
+                # Note: 開頭的句子
+                r'(?:^|\n)\s*Note\s*:.*?(?:\n|$)',
+                # "I have" 開頭的解釋句
+                r'(?:^|\n)\s*I\s+have\s+(?:followed|adhered\s+to|ensured|strictly\s+adhered\s+to|also\s+followed).*?(?:\n|$)',
+                # "This description" 開頭的說明
+                r'(?:^|\n)\s*This\s+description\s+(?:follows|adheres\s+to|maintains).*?(?:\n|$)',
+                # "The enhanced description" 開頭的說明
+                r'(?:^|\n)\s*The\s+enhanced\s+description\s+(?:maintains|preserves).*?(?:\n|$)',
+                # "Additionally, I have" 模式
+                r'(?:^|\n)\s*Additionally,?\s*I\s+have.*?(?:\n|$)',
+                # "I've" 開頭的解釋
+                r'(?:^|\n)\s*I\'ve\s+(?:maintained|preserved|ensured|avoided).*?(?:\n|$)',
+                # "Please note" 開頭
+                r'(?:^|\n)\s*Please\s+note.*?(?:\n|$)',
+                # "Remember" 開頭
+                r'(?:^|\n)\s*Remember.*?(?:\n|$)',
+                # 括號內的解釋
+                r'\([^)]*(?:adhered|followed|rule|accuracy|speculation)[^)]*\)',
+                # "avoiding any assumptions" 相關
+                r'(?:^|\n).*?avoiding\s+any\s+(?:assumptions|inferences|speculation).*?(?:\n|$)',
+                # "object whitelist" 相關
+                r'(?:^|\n).*?object\s+whitelist.*?(?:\n|$)',
+                # "detail accuracy rule" 相關
+                r'(?:^|\n).*?detail\s+accuracy\s+rule.*?(?:\n|$)',
+                # "critical adherence" 相關
+                r'(?:^|\n).*?critical\s+adherence.*?(?:\n|$)',
+                # "transitional phrases" 相關
+                r'(?:^|\n).*?transitional\s+phrases.*?(?:\n|$)',
+                # "varying sentence structures" 相關
+                r'(?:^|\n).*?varying\s+sentence\s+structures.*?(?:\n|$)',
+                # "natural flow" 相關
+                r'(?:^|\n).*?natural\s+flow.*?(?:\n|$)',
+                # 長句形式的規則說明
+                r'(?:^|\n).*?(?:focused\s+on\s+describing|clear\s+and\s+concise\s+manner).*?(?:\n|$)'
             ]
+            # 階段2：處理段落分割
             paragraphs = [p.strip() for p in response.split('\n\n') if p.strip()]
+            if debug:
+                print(f"Original paragraphs count: {len(paragraphs)}")
+            # 階段3：如果只有一個段落，進行內部清理
+            if len(paragraphs) <= 1:
+                cleaned_text = response
                 for pattern in note_patterns:
+                    cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE | re.MULTILINE)
+                # 額外清理常見的問題短語
+                problematic_phrases = [
+                    r'\b(?:Note\s+that\s+)?I\s+have\s+strictly\s+adhered\s+to.*?\.?',
+                    r'\b(?:Additionally,?\s*)?I\s+have\s+followed.*?\.?',
+                    r'\b(?:I\s+have\s+)?avoided\s+(?:any\s+)?(?:assumptions|speculation).*?\.?',
+                    r'\busing\s+transitional\s+phrases.*?\.?',
+                    r'\bcreate\s+a\s+natural\s+flow.*?\.?'
+                ]
+                for phrase in problematic_phrases:
+                    cleaned_text = re.sub(phrase, '', cleaned_text, flags=re.IGNORECASE)
+                # 清理多餘空格和標點
+                cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
+                cleaned_text = re.sub(r'\s*,\s*,\s*', ', ', cleaned_text)
+                cleaned_text = re.sub(r'\s*\.\s*\.\s*', '. ', cleaned_text)
+                return cleaned_text.strip()
+            # 階段4：多段落處理 - 篩選內容段落
             content_paragraphs = []
             for paragraph in paragraphs:
+                is_explanatory = False
+                # 檢查是否為解釋性段落
                 for pattern in note_patterns:
+                    if re.search(pattern, paragraph, flags=re.IGNORECASE | re.MULTILINE):
+                        is_explanatory = True
+                        if debug:
+                            print(f"Removed explanatory paragraph: {paragraph[:50]}...")
                         break
+                # 檢查常見的解釋性開頭
+                explanatory_starters = [
+                    'note:', 'please note:', 'remember:', 'i have followed',
+                    'i have adhered', 'i have strictly', 'additionally, i',
+                    'this description follows', 'the enhanced description',
+                    'i\'ve maintained', 'i\'ve preserved', 'i\'ve ensured'
+                ]
+                for starter in explanatory_starters:
+                    if paragraph.lower().startswith(starter):
+                        is_explanatory = True
+                        if debug:
+                            print(f"Removed paragraph starting with '{starter}': {paragraph[:50]}...")
+                        break
+                # 檢查是否包含過多的規則相關詞彙
+                rule_keywords = ['adherence', 'whitelist', 'accuracy rule', 'assumptions',
+                               'inferences', 'speculation', 'transitional phrases']
+                keyword_count = sum(1 for keyword in rule_keywords if keyword in paragraph.lower())
+                if keyword_count >= 2:  # 如果包含2個以上規則關鍵詞，視為解釋性段落
+                    is_explanatory = True
+                    if debug:
+                        print(f"Removed rule-heavy paragraph: {paragraph[:50]}...")
+                # 保留非解釋性段落
+                if not is_explanatory:
                     content_paragraphs.append(paragraph)
+            # 階段5：重新組合段落
+            if content_paragraphs:
+                result = '\n\n'.join(content_paragraphs).strip()
+            else:
+                # 如果所有段落都被移除，嘗試保留最長的段落並進行基本清理
+                if paragraphs:
+                    longest_para = max(paragraphs, key=len)
+                    result = re.sub(r'(?:Note:.*?\.)|(?:\([^)]*rule[^)]*\))', '', longest_para, flags=re.IGNORECASE)
+                    result = re.sub(r'\s+', ' ', result).strip()
+                else:
+                    result = ""
+            # 階段6：最終清理
+            if result:
+                # 移除可能殘留的解釋性片段
+                result = re.sub(r'\s*,?\s*avoiding\s+any\s+(?:assumptions|speculation).*?(?=\.|$)', '', result, flags=re.IGNORECASE)
+                result = re.sub(r'\s*,?\s*using\s+(?:transitional\s+phrases|clear\s+and\s+concise).*?(?=\.|$)', '', result, flags=re.IGNORECASE)
+                # 標準化標點符號和空格
+                result = re.sub(r'\s+', ' ', result)
+                result = re.sub(r'\s*([,.!?])\s*', r'\1 ', result)
+                result = re.sub(r'\s+([,.!?])', r'\1', result)
+                result = result.strip()
+            if debug:
+                print(f"Cleaning completed. Original length: {len(original_response)}, Final length: {len(result)}")
+            return result if result else original_response
         except Exception as e:
+            # 如果處理過程中發生錯誤，返回原始文字
+            if debug:
+                print(f"Error during cleaning: {str(e)}")
+            return original_response
     def get_processor_info(self) -> Dict[str, Any]:
         """