DawnC committed on
Commit
1c7033a
·
verified ·
1 Parent(s): f7ca18f

Update response_processor.py

Browse files
Files changed (1) hide show
  1. response_processor.py +81 -69
response_processor.py CHANGED
@@ -1220,7 +1220,7 @@ class ResponseProcessor:
1220
 
1221
  def remove_explanatory_notes(self, response: str) -> str:
1222
  """
1223
- 移除解釋性注釋和說明,特別處理破碎的解釋性片段
1224
 
1225
  Args:
1226
  response: 包含可能注釋的回應
@@ -1232,66 +1232,80 @@ class ResponseProcessor:
1232
  if not response or not response.strip():
1233
  return response
1234
 
1235
- # 第一步:移除完整的解釋性句子片段
1236
- problematic_fragments = [
1237
- # 處理完整的 "Note that I have..." 破碎句型
1238
- r'Note\s+that\s+I\s+have\s*,?\s*avoiding\s+any\s+assumptions.*?natural\s+flow\.?',
1239
-
1240
- # 處理 "avoiding any assumptions I have also" 片段
1241
- r'\bavoiding\s+any\s+assumptions\s+I\s+have\s+also\s+and\s+detail\s+accuracy\s+rule.*?\.?',
1242
 
1243
- # 處理 "and their locations. Additionally, I have" 片段
1244
- r'\band\s+their\s+locations\.\s*Additionally,?\s*I\s+have\s+and\s+have\s+focused.*?\.?',
1245
 
1246
- # 處理 "using transitional phrases..." 片段
1247
- r'\busing\s+transitional\s+phrases\s+and\s+varying\s+sentence\s+structures\s+to\s+create\s+a\s+natural\s+flow\.?',
 
 
 
 
 
 
 
 
 
 
1248
 
1249
- # 處理 "strictly adhered to..." 相關片段
1250
- r'\bstrictly\s+adhered\s+to\s+the\s+(?:critical\s+adherence\s+to\s+input\s+rule|or\s+inferences\s+beyond\s+the\s+explicitly\s+provided\s+information)\.?',
1251
 
1252
- # 處理 "or inferences beyond..." 片段
1253
- r'\bor\s+inferences\s+beyond\s+the\s+explicitly\s+provided\s+information\.?',
1254
 
1255
- # 處理 "the mentioning only..." 片段
1256
- r'\bthe\s+mentioning\s+only\s+the\s+objects\s+and\s+their\s+locations\.?',
1257
 
1258
- # 處理 "avoided speculating..." 片段
1259
- r'\bavoided\s+speculating\s+on\s+object\s+quantities,?\s*spatial\s+relationships,?\s*and\s+atmospheres,?\.?',
1260
 
1261
- # 處理 "and detail accuracy rule" 片段
1262
- r'\band\s+detail\s+accuracy\s+rule,?\s*and\s+their\s+locations\.?',
1263
 
1264
- # 處理更一般的解釋性片段
1265
- r'\b(?:have\s+)?strictly\s+adhered\s+to.*?(?:information|rule)\.?',
1266
- r'\b(?:have\s+)?followed\s+the.*?(?:whitelist|rule)\.?',
1267
- r'\b(?:have\s+)?avoided\s+(?:any\s+)?(?:assumptions|speculation).*?\.?',
1268
- r'\bmentioning\s+only\s+the\s+objects.*?\.?',
1269
 
1270
- # 處理孤立的片段詞組
1271
- r'\bthe\s+mentioning\s+only\b',
1272
- r'\bavoided\s+speculating\b',
1273
- r'\bstrictly\s+adhered\s+to\s+the\b',
1274
- r'\bor\s+inferences\s+beyond\b',
1275
- r'\band\s+detail\s+accuracy\s+rule\b',
1276
- r'\bAdditionally,?\s*I\s+have\s+and\s+have\s+focused\b',
1277
- r'\bclear\s+and\s+concise\s+manner,?\s*using\s+transitional\s+phrases\b',
 
 
 
 
 
1278
  ]
1279
 
1280
- cleaned_response = response
1281
- for pattern in problematic_fragments:
1282
- cleaned_response = re.sub(pattern, '', cleaned_response, flags=re.IGNORECASE)
1283
 
1284
- # 第二步:清理標點符號問題
1285
  # 移除多餘的逗號和句號
1286
- cleaned_response = re.sub(r'\s*,\s*,+\s*', ', ', cleaned_response)
1287
- cleaned_response = re.sub(r'\s*\.+\s*\.+\s*', '. ', cleaned_response)
1288
- cleaned_response = re.sub(r'\s*,\s*\.\s*', '. ', cleaned_response)
 
 
 
 
1289
 
1290
- # 修復句子結尾的孤立標點
1291
- cleaned_response = re.sub(r'\s+,\s*$', '.', cleaned_response)
1292
- cleaned_response = re.sub(r'\s+,\s*(?=\s+[A-Z])', '. ', cleaned_response)
1293
 
1294
- # 第三步:傳統的段落級處理
1295
  traditional_note_patterns = [
1296
  r'(?:^|\n)Note:.*?(?:\n|$)',
1297
  r'(?:^|\n)I have (?:followed|adhered to|ensured).*?(?:\n|$)',
@@ -1299,16 +1313,13 @@ class ResponseProcessor:
1299
  r'(?:^|\n)The enhanced description (?:maintains|preserves).*?(?:\n|$)'
1300
  ]
1301
 
1302
- # 尋找段落
1303
- paragraphs = [p.strip() for p in cleaned_response.split('\n\n') if p.strip()]
1304
 
1305
- # 如果只有一個段落,檢查並清理它
1306
  if len(paragraphs) == 1:
1307
  for pattern in traditional_note_patterns:
1308
  paragraphs[0] = re.sub(pattern, '', paragraphs[0], flags=re.IGNORECASE)
1309
  result = paragraphs[0].strip()
1310
  else:
1311
- # 如果有多個段落,移除注釋段落
1312
  content_paragraphs = []
1313
  for paragraph in paragraphs:
1314
  is_note = False
@@ -1317,7 +1328,6 @@ class ResponseProcessor:
1317
  is_note = True
1318
  break
1319
 
1320
- # 檢查段落是否以常見的注釋詞開頭
1321
  if paragraph.lower().startswith(('note:', 'please note:', 'remember:')):
1322
  is_note = True
1323
 
@@ -1326,37 +1336,39 @@ class ResponseProcessor:
1326
 
1327
  result = '\n\n'.join(content_paragraphs).strip()
1328
 
1329
- # 第四步:最終清理和格式化
1330
  if result:
1331
  # 標準化空格
1332
  result = re.sub(r'\s+', ' ', result)
1333
 
1334
- # 修復句子間的間距
1335
- result = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', result)
1336
-
1337
  # 確保句子以適當的標點結尾
1338
  result = result.strip()
1339
  if result and not result.endswith(('.', '!', '?')):
1340
  result += '.'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1341
 
1342
  return result
1343
 
1344
- # 如果結果為空,嘗試更保守的清理
1345
- fallback_result = response
1346
- conservative_patterns = [
1347
- r'\bstrictly\s+adhered\s+to.*?information\.?',
1348
- r'\bavoided\s+speculating.*?atmospheres,?\.?',
1349
- r'\bthe\s+mentioning\s+only.*?locations\.?'
1350
- ]
1351
-
1352
- for pattern in conservative_patterns:
1353
- fallback_result = re.sub(pattern, '', fallback_result, flags=re.IGNORECASE)
1354
-
1355
- fallback_result = re.sub(r'\s+', ' ', fallback_result).strip()
1356
- return fallback_result if fallback_result else response
1357
 
1358
  except Exception as e:
1359
- self.logger.error(f"Failed to remove explanatory notes: {str(e)}")
 
1360
  return response
1361
 
1362
  def get_processor_info(self) -> Dict[str, Any]:
 
1220
 
1221
  def remove_explanatory_notes(self, response: str) -> str:
1222
  """
1223
+ 移除解釋性注釋和說明,採用多階段清理策略處理破碎片段
1224
 
1225
  Args:
1226
  response: 包含可能注釋的回應
 
1232
  if not response or not response.strip():
1233
  return response
1234
 
1235
+ original_response = response
1236
+
1237
+ # 階段1:移除明確的完整問題句型
1238
+ complete_problem_patterns = [
1239
+ # Complete broken-sentence pattern (non-greedy: `.*?` stops at the first closing phrase)
1240
+ r'Note\s+that\s+I\s+have\s*[,.\s]*.*?(?:natural\s+flow|concise\s+manner)[,.\s]*',
 
1241
 
1242
+ # Note that 開始到句號結束的整個片段
1243
+ r'Note\s+that\s+I\s+have\s*[,.\s]*.*?\.',
1244
 
1245
+ # 處理包含 avoiding assumptions 的整個片段
1246
+ r'[,.\s]*avoiding\s+any\s+assumptions.*?(?:manner|flow|locations)[,.\s]*',
1247
+ ]
1248
+
1249
+ cleaned_text = response
1250
+ for pattern in complete_problem_patterns:
1251
+ cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE | re.DOTALL)
1252
+
1253
+ # 階段2:移除具體的問題關鍵詞組合
1254
+ specific_fragments = [
1255
+ # 移除 "I have also" 相關片段
1256
+ r'\bI\s+have\s+also\s*[,.\s]*(?:and\s+detail\s+accuracy\s+rule\s*[,.\s]*)?',
1257
 
1258
+ # 移除 "and their locations" 孤立片段
1259
+ r'[,.\s]*and\s+their\s+locations[,.\s]*',
1260
 
1261
+ # 移除 "on describing in a clear" 片段
1262
+ r'[,.\s]*on\s+describing\s+in\s+a\s+clear(?:\s+and\s+concise)?(?:\s+manner)?[,.\s]*',
1263
 
1264
+ # 移除 "detail accuracy rule" 相關
1265
+ r'[,.\s]*(?:and\s+)?detail\s+accuracy\s+rule[,.\s]*',
1266
 
1267
+ # 移除孤立的 "avoiding any assumptions"
1268
+ r'[,.\s]*avoiding\s+any\s+assumptions[,.\s]*',
1269
 
1270
+ # 移除 "Additionally, I have" 開頭的破碎片段
1271
+ r'Additionally,?\s*I\s+have\s*[,.\s]*(?:and\s+have\s+focused\s*[,.\s]*)?',
1272
 
1273
+ # 移除 "using transitional phrases" 相關
1274
+ r'[,.\s]*using\s+transitional\s+phrases(?:\s+and\s+varying\s+sentence\s+structures)?[,.\s]*',
 
 
 
1275
 
1276
+ # 移除 "to create a natural flow"
1277
+ r'[,.\s]*to\s+create\s+a\s+natural\s+flow[,.\s]*',
1278
+ ]
1279
+
1280
+ for pattern in specific_fragments:
1281
+ cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE)
1282
+
1283
+ # 階段3:移除任何以問題關鍵詞開頭的殘留片段
1284
+ problem_starters = [
1285
+ r'^[,.\s]*Note\s+that.*?[,.\s]*',
1286
+ r'^[,.\s]*I\s+have\s+(?:strictly\s+)?(?:adhered|followed|ensured).*?[,.\s]*',
1287
+ r'^[,.\s]*avoiding\s+any.*?[,.\s]*',
1288
+ r'^[,.\s]*Additionally.*?[,.\s]*',
1289
  ]
1290
 
1291
+ for pattern in problem_starters:
1292
+ cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE | re.MULTILINE)
 
1293
 
1294
+ # 階段4:清理標點符號和格式問題
1295
  # 移除多餘的逗號和句號
1296
+ cleaned_text = re.sub(r'\s*,\s*,+\s*', ', ', cleaned_text)
1297
+ cleaned_text = re.sub(r'\s*\.+\s*\.+\s*', '. ', cleaned_text)
1298
+ cleaned_text = re.sub(r'\s*,\s*\.\s*', '. ', cleaned_text)
1299
+
1300
+ # 移除開頭和結尾的標點符號
1301
+ cleaned_text = re.sub(r'^[,.\s]+', '', cleaned_text)
1302
+ cleaned_text = re.sub(r'[,.\s]+$', '', cleaned_text)
1303
 
1304
+ # 修復句子間的標點問題
1305
+ cleaned_text = re.sub(r'([.!?])\s*,\s*([A-Z])', r'\1 \2', cleaned_text)
1306
+ cleaned_text = re.sub(r',\s*([A-Z])', r'. \1', cleaned_text)
1307
 
1308
+ # 階段5:傳統段落級處理(保持原有邏輯)
1309
  traditional_note_patterns = [
1310
  r'(?:^|\n)Note:.*?(?:\n|$)',
1311
  r'(?:^|\n)I have (?:followed|adhered to|ensured).*?(?:\n|$)',
 
1313
  r'(?:^|\n)The enhanced description (?:maintains|preserves).*?(?:\n|$)'
1314
  ]
1315
 
1316
+ paragraphs = [p.strip() for p in cleaned_text.split('\n\n') if p.strip()]
 
1317
 
 
1318
  if len(paragraphs) == 1:
1319
  for pattern in traditional_note_patterns:
1320
  paragraphs[0] = re.sub(pattern, '', paragraphs[0], flags=re.IGNORECASE)
1321
  result = paragraphs[0].strip()
1322
  else:
 
1323
  content_paragraphs = []
1324
  for paragraph in paragraphs:
1325
  is_note = False
 
1328
  is_note = True
1329
  break
1330
 
 
1331
  if paragraph.lower().startswith(('note:', 'please note:', 'remember:')):
1332
  is_note = True
1333
 
 
1336
 
1337
  result = '\n\n'.join(content_paragraphs).strip()
1338
 
1339
+ # 階段6:最終驗證和格式化
1340
  if result:
1341
  # 標準化空格
1342
  result = re.sub(r'\s+', ' ', result)
1343
 
 
 
 
1344
  # 確保句子以適當的標點結尾
1345
  result = result.strip()
1346
  if result and not result.endswith(('.', '!', '?')):
1347
  result += '.'
1348
+
1349
+ # 最終檢查:如果結果太短,使用更保守的方法
1350
+ if len(result.split()) < 5:
1351
+ conservative_result = original_response
1352
+ # 只移除最明顯的問題片段
1353
+ conservative_patterns = [
1354
+ r'Note\s+that\s+I\s+have.*?manner[,.\s]*',
1355
+ r'avoiding\s+any\s+assumptions.*?locations[,.\s]*',
1356
+ r'Additionally,?\s*I\s+have.*?flow[,.\s]*'
1357
+ ]
1358
+ for pattern in conservative_patterns:
1359
+ conservative_result = re.sub(pattern, '', conservative_result, flags=re.IGNORECASE)
1360
+
1361
+ conservative_result = re.sub(r'\s+', ' ', conservative_result).strip()
1362
+ return conservative_result if conservative_result else original_response
1363
 
1364
  return result
1365
 
1366
+ # 如果所有處理後結果為空,返回原始內容
1367
+ return original_response
 
 
 
 
 
 
 
 
 
 
 
1368
 
1369
  except Exception as e:
1370
+ if hasattr(self, 'logger'):
1371
+ self.logger.error(f"Failed to remove explanatory notes: {str(e)}")
1372
  return response
1373
 
1374
  def get_processor_info(self) -> Dict[str, Any]: