DawnC committed on
Commit
5895031
·
verified ·
1 Parent(s): c42671c

Update response_processor.py

Browse files
Files changed (1) hide show
  1. response_processor.py +164 -34
response_processor.py CHANGED
@@ -1218,55 +1218,185 @@ class ResponseProcessor:
1218
  if not re.search(r'[.!?]', response):
1219
  raise ResponseProcessingError("Response lacks proper sentence structure")
1220
 
1221
def remove_explanatory_notes(self, response: str) -> str:
    """
    Remove explanatory notes and meta-commentary from a response.

    A "note line" is a line that starts (case-insensitively) with one of:
    ``Note:``, ``I have followed/adhered to/ensured``,
    ``This description follows/adheres to/maintains`` or
    ``The enhanced description maintains/preserves``.

    * A response with a single paragraph has its note lines dropped; the
      remaining lines keep their original line breaks.
    * A response with several paragraphs (separated by blank lines) has
      every paragraph dropped that contains a note line, or that starts
      with ``note:``, ``please note:`` or ``remember:``.

    Args:
        response: Response text that may contain notes.

    Returns:
        str: The response with the notes removed. If anything raises while
        processing, the error is logged and ``response`` is returned
        unchanged.
    """
    try:
        # One alternation, matched with ``match`` against each line, so a
        # removed line can never swallow the newlines around it (which
        # would glue its neighbours together) and consecutive note lines
        # are all caught.
        note_line = re.compile(
            r'(?:Note:'
            r'|I have (?:followed|adhered to|ensured)'
            r'|This description (?:follows|adheres to|maintains)'
            r'|The enhanced description (?:maintains|preserves))',
            re.IGNORECASE,
        )
        note_starters = ('note:', 'please note:', 'remember:')

        paragraphs = [p.strip() for p in response.split('\n\n') if p.strip()]

        # Single paragraph: clean it line by line.
        if len(paragraphs) == 1:
            kept_lines = [
                line for line in paragraphs[0].split('\n')
                if not note_line.match(line)
            ]
            return '\n'.join(kept_lines).strip()

        # Several paragraphs: drop the ones that are (or contain) notes.
        content_paragraphs = [
            paragraph for paragraph in paragraphs
            if not paragraph.lower().startswith(note_starters)
            and not any(note_line.match(line) for line in paragraph.split('\n'))
        ]
        return '\n\n'.join(content_paragraphs).strip()

    except Exception as e:
        # Best effort: never let clean-up break the caller.
        self.logger.error(f"Failed to remove explanatory notes: {str(e)}")
        return response
 
 
1270
 
1271
  def get_processor_info(self) -> Dict[str, Any]:
1272
  """
 
1218
  if not re.search(r'[.!?]', response):
1219
  raise ResponseProcessingError("Response lacks proper sentence structure")
1220
 
1221
@staticmethod
def remove_explanatory_notes(response: str, debug: bool = False) -> str:
    """
    Remove explanatory notes and internal-processing text from an LLM response.

    Declared as a static method because it takes no ``self``: this keeps both
    ``instance.remove_explanatory_notes(text)`` and
    ``ResponseProcessor.remove_explanatory_notes(text)`` working with the
    same parameter list.

    Processing:
      1. A response with a single paragraph is cleaned in place: note lines
         and rule-related lines are deleted, leftover meta sentences are
         removed, and whitespace is collapsed to single spaces.
      2. A response with several paragraphs (separated by blank lines) has
         every paragraph dropped that matches a note pattern, starts with a
         known explanatory opener, or mentions two or more rule keywords.
         If nothing survives, the longest paragraph is kept after a basic
         clean-up.
      3. The surviving text gets trailing meta clauses removed and its
         whitespace and punctuation spacing normalised.

    Args:
        response: Response text that may contain notes.
        debug: When true, print progress information to stdout.

    Returns:
        str: The cleaned text. Empty or whitespace-only input is returned
        unchanged. The original response is returned when cleaning would
        leave nothing, or when an exception is raised while cleaning.
    """
    if not response or not response.strip():
        return response

    original_response = response

    try:
        line_flags = re.IGNORECASE | re.MULTILINE

        # Stage 1: patterns for explicit notes. Whole-line patterns are
        # anchored with ^/$ under MULTILINE instead of consuming the
        # surrounding newlines, so deleting a line never glues the
        # previous and next line together.
        note_patterns = [
            # Lines starting with "Note:"
            r'^\s*Note\s*:.*$',
            # Explanations starting with "I have"
            r'^\s*I\s+have\s+(?:followed|adhered\s+to|ensured|strictly\s+adhered\s+to|also\s+followed).*$',
            # Lines starting with "This description"
            r'^\s*This\s+description\s+(?:follows|adheres\s+to|maintains).*$',
            # Lines starting with "The enhanced description"
            r'^\s*The\s+enhanced\s+description\s+(?:maintains|preserves).*$',
            # "Additionally, I have"
            r'^\s*Additionally,?\s*I\s+have.*$',
            # Explanations starting with "I've"
            r"^\s*I've\s+(?:maintained|preserved|ensured|avoided).*$",
            # Lines starting with "Please note"
            r'^\s*Please\s+note.*$',
            # Lines starting with "Remember"
            r'^\s*Remember.*$',
            # Parenthesised explanations
            r'\([^)]*(?:adhered|followed|rule|accuracy|speculation)[^)]*\)',
            # Lines mentioning rule-related wording anywhere
            r'^.*?avoiding\s+any\s+(?:assumptions|inferences|speculation).*$',
            r'^.*?object\s+whitelist.*$',
            r'^.*?detail\s+accuracy\s+rule.*$',
            r'^.*?critical\s+adherence.*$',
            r'^.*?transitional\s+phrases.*$',
            r'^.*?varying\s+sentence\s+structures.*$',
            r'^.*?natural\s+flow.*$',
            # Long-form rule explanations
            r'^.*?(?:focused\s+on\s+describing|clear\s+and\s+concise\s+manner).*$',
        ]

        # Meta sentences embedded in running text. Each pattern consumes up
        # to and including the sentence terminator so that no dangling
        # fragment of the sentence is left behind.
        sentence_patterns = [
            r'\b(?:Note\s+that\s+)?I\s+have\s+strictly\s+adhered\s+to[^.!?]*[.!?]?',
            r'\b(?:Additionally,?\s*)?I\s+have\s+followed[^.!?]*[.!?]?',
            r'\b(?:I\s+have\s+)?avoided\s+(?:any\s+)?(?:assumptions|speculation)[^.!?]*[.!?]?',
            r'\busing\s+transitional\s+phrases[^.!?]*[.!?]?',
            r'\bcreate\s+a\s+natural\s+flow[^.!?]*[.!?]?',
        ]

        # Loop-invariant lookup data for the multi-paragraph filter.
        explanatory_starters = (
            'note:', 'please note:', 'remember:', 'i have followed',
            'i have adhered', 'i have strictly', 'additionally, i',
            'this description follows', 'the enhanced description',
            'i\'ve maintained', 'i\'ve preserved', 'i\'ve ensured',
        )
        rule_keywords = (
            'adherence', 'whitelist', 'accuracy rule', 'assumptions',
            'inferences', 'speculation', 'transitional phrases',
        )

        # Stage 2: split into paragraphs.
        paragraphs = [p.strip() for p in response.split('\n\n') if p.strip()]

        if debug:
            print(f"Original paragraphs count: {len(paragraphs)}")

        # Stage 3: a single paragraph is cleaned internally.
        if len(paragraphs) <= 1:
            cleaned_text = response
            for pattern in note_patterns:
                cleaned_text = re.sub(pattern, '', cleaned_text, flags=line_flags)

            for pattern in sentence_patterns:
                cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE)

            # Tidy whitespace and punctuation doubled up by the removals.
            cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
            cleaned_text = re.sub(r'\s*,\s*,\s*', ', ', cleaned_text)
            cleaned_text = re.sub(r'\s*\.\s*\.\s*', '. ', cleaned_text)
            cleaned_text = cleaned_text.strip()

            # Same fallback as the multi-paragraph path: never hand back an
            # empty string for a non-empty response.
            return cleaned_text if cleaned_text else original_response

        # Stage 4: several paragraphs - keep only the content paragraphs.
        content_paragraphs = []

        for paragraph in paragraphs:
            lowered = paragraph.lower()
            removal_reason = None

            if any(re.search(p, paragraph, flags=line_flags) for p in note_patterns):
                removal_reason = "Removed explanatory paragraph"
            else:
                starter = next(
                    (s for s in explanatory_starters if lowered.startswith(s)),
                    None,
                )
                if starter is not None:
                    removal_reason = f"Removed paragraph starting with '{starter}'"
                elif sum(keyword in lowered for keyword in rule_keywords) >= 2:
                    # Two or more rule keywords mark the paragraph as meta text.
                    removal_reason = "Removed rule-heavy paragraph"

            if removal_reason is None:
                content_paragraphs.append(paragraph)
            elif debug:
                print(f"{removal_reason}: {paragraph[:50]}...")

        # Stage 5: reassemble.
        if content_paragraphs:
            result = '\n\n'.join(content_paragraphs).strip()
        else:
            # Every paragraph was rejected: keep the longest one after a
            # basic clean-up rather than returning nothing.
            longest_para = max(paragraphs, key=len)
            result = re.sub(
                r'(?:Note:.*?\.)|(?:\([^)]*rule[^)]*\))',
                '',
                longest_para,
                flags=re.IGNORECASE,
            )
            result = re.sub(r'\s+', ' ', result).strip()

        # Stage 6: final clean-up.
        if result:
            # Remove leftover explanatory clauses up to the next period.
            result = re.sub(
                r'\s*,?\s*avoiding\s+any\s+(?:assumptions|speculation).*?(?=\.|$)',
                '',
                result,
                flags=re.IGNORECASE,
            )
            result = re.sub(
                r'\s*,?\s*using\s+(?:transitional\s+phrases|clear\s+and\s+concise).*?(?=\.|$)',
                '',
                result,
                flags=re.IGNORECASE,
            )

            # Normalise whitespace. Note that this also collapses the blank
            # lines between paragraphs into single spaces.
            result = re.sub(r'\s+', ' ', result)
            # No space before punctuation ...
            result = re.sub(r'\s+([,.!?])', r'\1', result)
            # ... and one space after it, but only when a letter follows, so
            # numbers such as "2.5" or "1,000" and closing quotes stay intact.
            result = re.sub(r'([,.!?])(?=[^\W\d_])', r'\1 ', result)
            result = result.strip()

        if debug:
            print(
                f"Cleaning completed. Original length: {len(original_response)}, "
                f"Final length: {len(result)}"
            )

        return result if result else original_response

    except Exception as e:
        # Best effort: if anything goes wrong, return the original text.
        if debug:
            print(f"Error during cleaning: {str(e)}")
        return original_response
1400
 
1401
  def get_processor_info(self) -> Dict[str, Any]:
1402
  """