DawnC committed on
Commit
4453070
·
verified ·
1 Parent(s): 2438acf

Update response_processor.py

Browse files
Files changed (1) hide show
  1. response_processor.py +82 -155
response_processor.py CHANGED
@@ -1218,185 +1218,112 @@ class ResponseProcessor:
1218
  if not re.search(r'[.!?]', response):
1219
  raise ResponseProcessingError("Response lacks proper sentence structure")
1220
 
1221
- def remove_explanatory_notes(self, response: str, debug: bool = False) -> str:
1222
  """
1223
- 移除 LLM 回應中的解釋性注釋和內部處理文字
1224
-
1225
  Args:
1226
- response: 包含可能注釋的回應文字
1227
- debug: 是否顯示除錯資訊
1228
-
1229
  Returns:
1230
- str: 移除注釋後的清理文字
1231
  """
1232
- if not response or not response.strip():
1233
- return response
1234
-
1235
- original_response = response
1236
-
1237
  try:
1238
- # 階段1:移除明確的注釋段落模式
1239
- note_patterns = [
1240
- # Note: 開頭的句子
1241
- r'(?:^|\n)\s*Note\s*:.*?(?:\n|$)',
1242
-
1243
- # "I have" 開頭的解釋句
1244
- r'(?:^|\n)\s*I\s+have\s+(?:followed|adhered\s+to|ensured|strictly\s+adhered\s+to|also\s+followed).*?(?:\n|$)',
1245
-
1246
- # "This description" 開頭的說明
1247
- r'(?:^|\n)\s*This\s+description\s+(?:follows|adheres\s+to|maintains).*?(?:\n|$)',
1248
-
1249
- # "The enhanced description" 開頭的說明
1250
- r'(?:^|\n)\s*The\s+enhanced\s+description\s+(?:maintains|preserves).*?(?:\n|$)',
1251
-
1252
- # "Additionally, I have" 模式
1253
- r'(?:^|\n)\s*Additionally,?\s*I\s+have.*?(?:\n|$)',
1254
-
1255
- # "I've" 開頭的解釋
1256
- r'(?:^|\n)\s*I\'ve\s+(?:maintained|preserved|ensured|avoided).*?(?:\n|$)',
1257
 
1258
- # "Please note" 開頭
1259
- r'(?:^|\n)\s*Please\s+note.*?(?:\n|$)',
1260
 
1261
- # "Remember" 開頭
1262
- r'(?:^|\n)\s*Remember.*?(?:\n|$)',
1263
 
1264
- # 括號內的解釋
1265
- r'\([^)]*(?:adhered|followed|rule|accuracy|speculation)[^)]*\)',
1266
 
1267
- # "avoiding any assumptions" 相關
1268
- r'(?:^|\n).*?avoiding\s+any\s+(?:assumptions|inferences|speculation).*?(?:\n|$)',
1269
 
1270
- # "object whitelist" 相關
1271
- r'(?:^|\n).*?object\s+whitelist.*?(?:\n|$)',
1272
 
1273
- # "detail accuracy rule" 相關
1274
- r'(?:^|\n).*?detail\s+accuracy\s+rule.*?(?:\n|$)',
1275
 
1276
- # "critical adherence" 相關
1277
- r'(?:^|\n).*?critical\s+adherence.*?(?:\n|$)',
1278
 
1279
- # "transitional phrases" 相關
1280
- r'(?:^|\n).*?transitional\s+phrases.*?(?:\n|$)',
1281
-
1282
- # "varying sentence structures" 相關
1283
- r'(?:^|\n).*?varying\s+sentence\s+structures.*?(?:\n|$)',
1284
-
1285
- # "natural flow" 相關
1286
- r'(?:^|\n).*?natural\s+flow.*?(?:\n|$)',
1287
-
1288
- # 長句形式的規則說明
1289
- r'(?:^|\n).*?(?:focused\s+on\s+describing|clear\s+and\s+concise\s+manner).*?(?:\n|$)'
1290
  ]
1291
 
1292
- # 階段2:處理段落分割
1293
- paragraphs = [p.strip() for p in response.split('\n\n') if p.strip()]
1294
-
1295
- if debug:
1296
- print(f"Original paragraphs count: {len(paragraphs)}")
1297
-
1298
- # 階段3:如果只有一個段落,進行內部清理
1299
- if len(paragraphs) <= 1:
1300
- cleaned_text = response
1301
- for pattern in note_patterns:
1302
- cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE | re.MULTILINE)
1303
-
1304
- # 額外清理常見的問題短語
1305
- problematic_phrases = [
1306
- r'\b(?:Note\s+that\s+)?I\s+have\s+strictly\s+adhered\s+to.*?\.?',
1307
- r'\b(?:Additionally,?\s*)?I\s+have\s+followed.*?\.?',
1308
- r'\b(?:I\s+have\s+)?avoided\s+(?:any\s+)?(?:assumptions|speculation).*?\.?',
1309
- r'\busing\s+transitional\s+phrases.*?\.?',
1310
- r'\bcreate\s+a\s+natural\s+flow.*?\.?'
1311
- ]
1312
-
1313
- for phrase in problematic_phrases:
1314
- cleaned_text = re.sub(phrase, '', cleaned_text, flags=re.IGNORECASE)
1315
-
1316
- # 清理多餘空格和標點
1317
- cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
1318
- cleaned_text = re.sub(r'\s*,\s*,\s*', ', ', cleaned_text)
1319
- cleaned_text = re.sub(r'\s*\.\s*\.\s*', '. ', cleaned_text)
1320
-
1321
- return cleaned_text.strip()
1322
 
1323
- # 階段4:多段落處理 - 篩選內容段落
 
 
 
 
 
 
 
 
 
 
 
 
 
1324
  content_paragraphs = []
1325
-
1326
  for paragraph in paragraphs:
1327
- is_explanatory = False
1328
-
1329
- # 檢查是否為解釋性段落
1330
- for pattern in note_patterns:
1331
- if re.search(pattern, paragraph, flags=re.IGNORECASE | re.MULTILINE):
1332
- is_explanatory = True
1333
- if debug:
1334
- print(f"Removed explanatory paragraph: {paragraph[:50]}...")
1335
- break
1336
-
1337
- # 檢查常見的解釋性開頭
1338
- explanatory_starters = [
1339
- 'note:', 'please note:', 'remember:', 'i have followed',
1340
- 'i have adhered', 'i have strictly', 'additionally, i',
1341
- 'this description follows', 'the enhanced description',
1342
- 'i\'ve maintained', 'i\'ve preserved', 'i\'ve ensured'
1343
- ]
1344
 
1345
- for starter in explanatory_starters:
1346
- if paragraph.lower().startswith(starter):
1347
- is_explanatory = True
1348
- if debug:
1349
- print(f"Removed paragraph starting with '{starter}': {paragraph[:50]}...")
1350
  break
1351
-
1352
- # 檢查是否包含過多的規則相關詞彙
1353
- rule_keywords = ['adherence', 'whitelist', 'accuracy rule', 'assumptions',
1354
- 'inferences', 'speculation', 'transitional phrases']
1355
- keyword_count = sum(1 for keyword in rule_keywords if keyword in paragraph.lower())
1356
-
1357
- if keyword_count >= 2: # 如果包含2個以上規則關鍵詞,視為解釋性段落
1358
- is_explanatory = True
1359
- if debug:
1360
- print(f"Removed rule-heavy paragraph: {paragraph[:50]}...")
1361
-
1362
- # 保留非解釋性段落
1363
- if not is_explanatory:
1364
  content_paragraphs.append(paragraph)
 
 
1365
 
1366
- # 階段5:重新組合段落
1367
- if content_paragraphs:
1368
- result = '\n\n'.join(content_paragraphs).strip()
1369
- else:
1370
- # 如果所有段落都被移除,嘗試保留最長的段落並進行基本清理
1371
- if paragraphs:
1372
- longest_para = max(paragraphs, key=len)
1373
- result = re.sub(r'(?:Note:.*?\.)|(?:\([^)]*rule[^)]*\))', '', longest_para, flags=re.IGNORECASE)
1374
- result = re.sub(r'\s+', ' ', result).strip()
1375
- else:
1376
- result = ""
1377
-
1378
- # 階段6:最終清理
1379
- if result:
1380
- # 移除可能殘留的解釋性片段
1381
- result = re.sub(r'\s*,?\s*avoiding\s+any\s+(?:assumptions|speculation).*?(?=\.|$)', '', result, flags=re.IGNORECASE)
1382
- result = re.sub(r'\s*,?\s*using\s+(?:transitional\s+phrases|clear\s+and\s+concise).*?(?=\.|$)', '', result, flags=re.IGNORECASE)
1383
 
1384
- # 標準化標點符號和空格
1385
- result = re.sub(r'\s+', ' ', result)
1386
- result = re.sub(r'\s*([,.!?])\s*', r'\1 ', result)
1387
- result = re.sub(r'\s+([,.!?])', r'\1', result)
1388
- result = result.strip()
1389
-
1390
- if debug and hasattr(self, 'logger'):
1391
- self.logger.info(f"Cleaning completed. Original length: {len(original_response)}, Final length: {len(result)}")
1392
-
1393
- return result if result else original_response
1394
-
1395
  except Exception as e:
1396
- # 如果處理過程中發生錯誤,返回原始文字
1397
- if debug and hasattr(self, 'logger'):
1398
- self.logger.error(f"Error during cleaning: {str(e)}")
1399
- return original_response
1400
 
1401
  def get_processor_info(self) -> Dict[str, Any]:
1402
  """
 
1218
  if not re.search(r'[.!?]', response):
1219
  raise ResponseProcessingError("Response lacks proper sentence structure")
1220
 
1221
def remove_explanatory_notes(self, response: str) -> str:
    """
    Remove self-referential explanatory notes from an LLM response.

    Cleaning happens at three levels:

    1. Paragraphs that open with "Note:", "Please note:" or "Remember:" are
       dropped entirely.
    2. Lines that open with a known meta-commentary lead-in (for example
       "This description follows ...") are dropped.
    3. Inside the remaining sentences, everything from a note phrase (for
       example "Note that I have ...", "avoiding any assumptions ...") up to
       the end of that sentence is removed; a sentence left without any
       content is dropped.

    Paragraph breaks between surviving paragraphs are preserved; whitespace
    inside a paragraph is collapsed to single spaces.

    Args:
        response: Response text that may contain explanatory notes.

    Returns:
        str: The response with the notes removed. Empty or non-string input
        is returned unchanged. If cleaning would leave fewer than 10
        characters, only the sentence-level cleaning is applied to the whole
        response, and if even that leaves nothing the original response is
        returned.
    """
    # Nothing to clean; also keeps non-string input out of the regex calls.
    if not isinstance(response, str) or not response.strip():
        return response

    # Phrases that start a self-referential remark. The remark is assumed to
    # run from the phrase to the end of the sentence it appears in.
    note_phrases = (
        r'Note\s+that\s+I\s+have',
        r'I\s+have\s+strictly\s+adhered\s+to',
        r'I\s+have\s+(?:followed|ensured|also\s+followed)',
        r'Additionally,?\s*I\s+have',
        r'avoiding\s+any\s+(?:assumptions|inferences)',
        r'object\s+whitelist\s+and\s+detail\s+accuracy\s+rule',
        r'detail\s+accuracy\s+rule',
        r'using\s+transitional\s+phrases',
        r'(?:and\s+have\s+)?focused\s+on\s+describing.*?natural\s+flow',
        r'critical\s+adherence\s+to\s+input\s+rule',
    )
    # The phrase must begin the sentence or follow a separator. A leading
    # connector ("and", "please") is swallowed so it does not dangle.
    fragment_re = re.compile(
        r'(?:^|[\s,;:]+)(?:and\s+|please\s+)?(?:' + '|'.join(note_phrases) + r')\b',
        re.IGNORECASE,
    )
    # Lead-ins that mark a whole line as meta-commentary.
    note_line_re = re.compile(
        r'(?:Note:'
        r'|I have (?:followed|adhered to|ensured)'
        r'|This description (?:follows|adheres to|maintains)'
        r'|The enhanced description (?:maintains|preserves))',
        re.IGNORECASE,
    )
    paragraph_starters = ('note:', 'please note:', 'remember:')

    def clean_sentences(text: str) -> str:
        """Cut note fragments out of each sentence of *text*."""
        text = re.sub(r'\s+', ' ', text).strip()
        kept = []
        # Split only where punctuation is followed by whitespace, so values
        # such as "3.5" stay in one piece.
        for sentence in re.split(r'(?<=[.!?])\s+', text):
            hit = fragment_re.search(sentence)
            if hit is None:
                kept.append(sentence)
                continue
            head = sentence[:hit.start()].rstrip(' ,;:')
            if not re.search(r'\w', head):
                # The whole sentence was a note.
                continue
            if head[-1] not in '.!?':
                # Re-terminate the shortened sentence with its own ending.
                head += sentence[-1] if sentence[-1] in '.!?' else '.'
            kept.append(head)
        return ' '.join(kept)

    try:
        kept_paragraphs = []
        # Split on blank lines BEFORE any whitespace normalisation so that
        # paragraph boundaries are still visible.
        for paragraph in re.split(r'\n\s*\n', response):
            paragraph = paragraph.strip()
            if not paragraph or paragraph.lower().startswith(paragraph_starters):
                continue
            body_lines = [
                line for line in paragraph.splitlines()
                if not note_line_re.match(line.strip())
            ]
            cleaned = clean_sentences(' '.join(body_lines))
            if cleaned:
                kept_paragraphs.append(cleaned)

        result = '\n\n'.join(kept_paragraphs)

        # Never hand back an (almost) empty string: retry with the gentler
        # sentence-level cleaning only, then fall back to the input itself.
        if len(result) < 10:
            fallback = clean_sentences(response)
            return fallback if fallback else response

        return result

    except Exception as e:
        # Best-effort cleaning: on any failure return the input untouched.
        logger = getattr(self, 'logger', None)
        if logger is not None:
            logger.error(f"Failed to remove explanatory notes: {str(e)}")
        return response
 
 
1327
 
1328
  def get_processor_info(self) -> Dict[str, Any]:
1329
  """