Spaces:

DawnC
/

VisionScout

Running on Zero

App Files Files Community

DawnC committed 27 days ago

Commit

4453070

verified ·

1 Parent(s): 2438acf

Update response_processor.py

Browse files

Files changed (1) hide show

response_processor.py +82 -155

response_processor.py CHANGED Viewed

@@ -1218,185 +1218,112 @@ class ResponseProcessor:
         if not re.search(r'[.!?]', response):
             raise ResponseProcessingError("Response lacks proper sentence structure")
-    def remove_explanatory_notes(self, response: str, debug: bool = False) -> str:
         """
-        移除 LLM 回應中的解釋性注釋和內部處理文字
         Args:
-            response: 包含可能注釋的回應文字
-            debug: 是否顯示除錯資訊
         Returns:
-            str: 移除注釋後的清理文字
         """
-        if not response or not response.strip():
-            return response
-        original_response = response
         try:
-            # 階段1：移除明確的注釋段落模式
-            note_patterns = [
-                # Note: 開頭的句子
-                r'(?:^|\n)\s*Note\s*:.*?(?:\n|$)',
-                # "I have" 開頭的解釋句
-                r'(?:^|\n)\s*I\s+have\s+(?:followed|adhered\s+to|ensured|strictly\s+adhered\s+to|also\s+followed).*?(?:\n|$)',
-                # "This description" 開頭的說明
-                r'(?:^|\n)\s*This\s+description\s+(?:follows|adheres\s+to|maintains).*?(?:\n|$)',
-                # "The enhanced description" 開頭的說明
-                r'(?:^|\n)\s*The\s+enhanced\s+description\s+(?:maintains|preserves).*?(?:\n|$)',
-                # "Additionally, I have" 模式
-                r'(?:^|\n)\s*Additionally,?\s*I\s+have.*?(?:\n|$)',
-                # "I've" 開頭的解釋
-                r'(?:^|\n)\s*I\'ve\s+(?:maintained|preserved|ensured|avoided).*?(?:\n|$)',
-                # "Please note" 開頭
-                r'(?:^|\n)\s*Please\s+note.*?(?:\n|$)',
-                # "Remember" 開頭
-                r'(?:^|\n)\s*Remember.*?(?:\n|$)',
-                # 括號內的解釋
-                r'\([^)]*(?:adhered|followed|rule|accuracy|speculation)[^)]*\)',
-                # "avoiding any assumptions" 相關
-                r'(?:^|\n).*?avoiding\s+any\s+(?:assumptions|inferences|speculation).*?(?:\n|$)',
-                # "object whitelist" 相關
-                r'(?:^|\n).*?object\s+whitelist.*?(?:\n|$)',
-                # "detail accuracy rule" 相關
-                r'(?:^|\n).*?detail\s+accuracy\s+rule.*?(?:\n|$)',
-                # "critical adherence" 相關
-                r'(?:^|\n).*?critical\s+adherence.*?(?:\n|$)',
-                # "transitional phrases" 相關
-                r'(?:^|\n).*?transitional\s+phrases.*?(?:\n|$)',
-                # "varying sentence structures" 相關
-                r'(?:^|\n).*?varying\s+sentence\s+structures.*?(?:\n|$)',
-                # "natural flow" 相關
-                r'(?:^|\n).*?natural\s+flow.*?(?:\n|$)',
-                # 長句形式的規則說明
-                r'(?:^|\n).*?(?:focused\s+on\s+describing|clear\s+and\s+concise\s+manner).*?(?:\n|$)'
             ]
-            # 階段2：處理段落分割
-            paragraphs = [p.strip() for p in response.split('\n\n') if p.strip()]
-            if debug:
-                print(f"Original paragraphs count: {len(paragraphs)}")
-            # 階段3：如果只有一個段落，進行內部清理
-            if len(paragraphs) <= 1:
-                cleaned_text = response
-                for pattern in note_patterns:
-                    cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE | re.MULTILINE)
-                # 額外清理常見的問題短語
-                problematic_phrases = [
-                    r'\b(?:Note\s+that\s+)?I\s+have\s+strictly\s+adhered\s+to.*?\.?',
-                    r'\b(?:Additionally,?\s*)?I\s+have\s+followed.*?\.?',
-                    r'\b(?:I\s+have\s+)?avoided\s+(?:any\s+)?(?:assumptions|speculation).*?\.?',
-                    r'\busing\s+transitional\s+phrases.*?\.?',
-                    r'\bcreate\s+a\s+natural\s+flow.*?\.?'
-                ]
-                for phrase in problematic_phrases:
-                    cleaned_text = re.sub(phrase, '', cleaned_text, flags=re.IGNORECASE)
-                # 清理多餘空格和標點
-                cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
-                cleaned_text = re.sub(r'\s*,\s*,\s*', ', ', cleaned_text)
-                cleaned_text = re.sub(r'\s*\.\s*\.\s*', '. ', cleaned_text)
-                return cleaned_text.strip()
-            # 階段4：多段落處理 - 篩選內容段落
             content_paragraphs = []
             for paragraph in paragraphs:
-                is_explanatory = False
-                # 檢查是否為解釋性段落
-                for pattern in note_patterns:
-                    if re.search(pattern, paragraph, flags=re.IGNORECASE | re.MULTILINE):
-                        is_explanatory = True
-                        if debug:
-                            print(f"Removed explanatory paragraph: {paragraph[:50]}...")
-                        break
-                # 檢查常見的解釋性開頭
-                explanatory_starters = [
-                    'note:', 'please note:', 'remember:', 'i have followed',
-                    'i have adhered', 'i have strictly', 'additionally, i',
-                    'this description follows', 'the enhanced description',
-                    'i\'ve maintained', 'i\'ve preserved', 'i\'ve ensured'
-                ]
-                for starter in explanatory_starters:
-                    if paragraph.lower().startswith(starter):
-                        is_explanatory = True
-                        if debug:
-                            print(f"Removed paragraph starting with '{starter}': {paragraph[:50]}...")
                         break
-                # 檢查是否包含過多的規則相關詞彙
-                rule_keywords = ['adherence', 'whitelist', 'accuracy rule', 'assumptions',
-                               'inferences', 'speculation', 'transitional phrases']
-                keyword_count = sum(1 for keyword in rule_keywords if keyword in paragraph.lower())
-                if keyword_count >= 2:  # 如果包含2個以上規則關鍵詞，視為解釋性段落
-                    is_explanatory = True
-                    if debug:
-                        print(f"Removed rule-heavy paragraph: {paragraph[:50]}...")
-                # 保留非解釋性段落
-                if not is_explanatory:
                     content_paragraphs.append(paragraph)
-            # 階段5：重新組合段落
-            if content_paragraphs:
-                result = '\n\n'.join(content_paragraphs).strip()
-            else:
-                # 如果所有段落都被移除，嘗試保留最長的段落並進行基本清理
-                if paragraphs:
-                    longest_para = max(paragraphs, key=len)
-                    result = re.sub(r'(?:Note:.*?\.)|(?:\([^)]*rule[^)]*\))', '', longest_para, flags=re.IGNORECASE)
-                    result = re.sub(r'\s+', ' ', result).strip()
-                else:
-                    result = ""
-            # 階段6：最終清理
-            if result:
-                # 移除可能殘留的解釋性片段
-                result = re.sub(r'\s*,?\s*avoiding\s+any\s+(?:assumptions|speculation).*?(?=\.|$)', '', result, flags=re.IGNORECASE)
-                result = re.sub(r'\s*,?\s*using\s+(?:transitional\s+phrases|clear\s+and\s+concise).*?(?=\.|$)', '', result, flags=re.IGNORECASE)
-                # 標準化標點符號和空格
-                result = re.sub(r'\s+', ' ', result)
-                result = re.sub(r'\s*([,.!?])\s*', r'\1 ', result)
-                result = re.sub(r'\s+([,.!?])', r'\1', result)
-                result = result.strip()
-            if debug and hasattr(self, 'logger'):
-                self.logger.info(f"Cleaning completed. Original length: {len(original_response)}, Final length: {len(result)}")
-            return result if result else original_response
         except Exception as e:
-            # 如果處理過程中發生錯誤，返回原始文字
-            if debug and hasattr(self, 'logger'):
-                self.logger.error(f"Error during cleaning: {str(e)}")
-            return original_response
     def get_processor_info(self) -> Dict[str, Any]:
         """

         if not re.search(r'[.!?]', response):
             raise ResponseProcessingError("Response lacks proper sentence structure")
+    def remove_explanatory_notes(self, response: str) -> str:
         """
+        移除解釋性注釋和說明，特別針對 "Note that I..."
         Args:
+            response: 包含可能注釋的回應
         Returns:
+            str: 移除注釋後的回應
         """
         try:
+            # 專門針對 "Note that I..." 和相關解釋性敘述
+            specific_note_patterns = [
+                # Note that I have...
+                r'(?:^|\s)Note\s+that\s+I\s+have.*?(?=\s[A-Z]|\.|$)',
+                # I have strictly adhered...
+                r'(?:^|\s)I\s+have\s+strictly\s+adhered\s+to.*?(?=\s[A-Z]|\.|$)',
+                # I have followed/ensured...
+                r'(?:^|\s)I\s+have\s+(?:followed|ensured|also\s+followed).*?(?=\s[A-Z]|\.|$)',
+                # Additionally, I have...
+                r'(?:^|\s)Additionally,?\s*I\s+have.*?(?=\s[A-Z]|\.|$)',
+                # avoiding any assumptions...
+                r'(?:^|\s)avoiding\s+any\s+(?:assumptions|inferences).*?(?=\s[A-Z]|\.|$)',
+                # object whitelist and detail accuracy rule
+                r'(?:^|\s)(?:object\s+whitelist\s+and\s+detail\s+accuracy\s+rule|detail\s+accuracy\s+rule).*?(?=\s[A-Z]|\.|$)',
+                # using transitional phrases
+                r'(?:^|\s)using\s+transitional\s+phrases.*?(?=\s[A-Z]|\.|$)',
+                # create a natural flow
+                r'(?:^|\s)(?:and\s+have\s+focused\s+on|focused\s+on)\s+describing.*?natural\s+flow.*?(?=\s[A-Z]|\.|$)',
+                # critical adherence to input rule
+                r'(?:^|\s)critical\s+adherence\s+to\s+input\s+rule.*?(?=\s[A-Z]|\.|$)'
             ]
+            # 傳統的注釋和解釋模式
+            traditional_note_patterns = [
+                r'(?:^|\n)Note:.*?(?:\n|$)',
+                r'(?:^|\n)I have (?:followed|adhered to|ensured).*?(?:\n|$)',
+                r'(?:^|\n)This description (?:follows|adheres to|maintains).*?(?:\n|$)',
+                r'(?:^|\n)The enhanced description (?:maintains|preserves).*?(?:\n|$)'
+            ]
+            # 首先移除特定的 "Note that I..."
+            cleaned_response = response
+            for pattern in specific_note_patterns:
+                cleaned_response = re.sub(pattern, '', cleaned_response, flags=re.IGNORECASE)
+            # 清理多餘的標點符號和空格
+            cleaned_response = re.sub(r'\s*,\s*,\s*', ', ', cleaned_response)
+            cleaned_response = re.sub(r'\s*\.\s*\.\s*', '. ', cleaned_response)
+            cleaned_response = re.sub(r'\s+', ' ', cleaned_response)
+            # 修復可能出現的句子結尾問題
+            cleaned_response = re.sub(r'(\w)\s*,\s*$', r'\1.', cleaned_response)
+            cleaned_response = re.sub(r'(\w)\s*,\s*([A-Z])', r'\1. \2', cleaned_response)
+            # 尋找段落進行傳統處理
+            paragraphs = [p.strip() for p in cleaned_response.split('\n\n') if p.strip()]
+            # 如果只有一個段落，檢查並清理傳統注釋模式
+            if len(paragraphs) == 1:
+                for pattern in traditional_note_patterns:
+                    paragraphs[0] = re.sub(pattern, '', paragraphs[0], flags=re.IGNORECASE)
+                return paragraphs[0].strip()
+            # 如果有多個段落，移除傳統注釋段落
             content_paragraphs = []
             for paragraph in paragraphs:
+                is_note = False
+                # 檢查傳統注釋模式
+                for pattern in traditional_note_patterns:
+                    if re.search(pattern, paragraph, flags=re.IGNORECASE):
+                        is_note = True
                         break
+                # 檢查段落是否以常見的注釋詞開頭
+                if paragraph.lower().startswith(('note:', 'please note:', 'remember:')):
+                    is_note = True
+                if not is_note:
                     content_paragraphs.append(paragraph)
+            result = '\n\n'.join(content_paragraphs).strip()
+            # 最終檢查：確保結果不為空
+            if not result or len(result.strip()) < 10:
+                # 如果處理後內容過短，返回去除特定模式後的原始內容
+                fallback_result = response
+                for pattern in specific_note_patterns:
+                    fallback_result = re.sub(pattern, '', fallback_result, flags=re.IGNORECASE)
+                fallback_result = re.sub(r'\s+', ' ', fallback_result).strip()
+                return fallback_result if fallback_result else response
+            return result
         except Exception as e:
+            self.logger.error(f"Failed to remove explanatory notes: {str(e)}")
+            return response
     def get_processor_info(self) -> Dict[str, Any]:
         """