Spaces:
Running
on
Zero
Running
on
Zero
Update response_processor.py
Browse files- response_processor.py +164 -34
response_processor.py
CHANGED
@@ -1218,55 +1218,185 @@ class ResponseProcessor:
|
|
1218 |
if not re.search(r'[.!?]', response):
|
1219 |
raise ResponseProcessingError("Response lacks proper sentence structure")
|
1220 |
|
1221 |
-
def remove_explanatory_notes(
|
1222 |
"""
|
1223 |
-
|
1224 |
-
|
1225 |
Args:
|
1226 |
-
response:
|
1227 |
-
|
|
|
1228 |
Returns:
|
1229 |
-
str:
|
1230 |
"""
|
|
|
|
|
|
|
|
|
|
|
1231 |
try:
|
1232 |
-
#
|
1233 |
note_patterns = [
|
1234 |
-
|
1235 |
-
r'(?:^|\n)
|
1236 |
-
|
1237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1238 |
]
|
1239 |
-
|
1240 |
-
#
|
1241 |
paragraphs = [p.strip() for p in response.split('\n\n') if p.strip()]
|
1242 |
-
|
1243 |
-
|
1244 |
-
|
|
|
|
|
|
|
|
|
1245 |
for pattern in note_patterns:
|
1246 |
-
|
1247 |
-
|
1248 |
-
|
1249 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1250 |
content_paragraphs = []
|
|
|
1251 |
for paragraph in paragraphs:
|
1252 |
-
|
|
|
|
|
1253 |
for pattern in note_patterns:
|
1254 |
-
if re.search(pattern, paragraph, flags=re.IGNORECASE):
|
1255 |
-
|
|
|
|
|
1256 |
break
|
1257 |
-
|
1258 |
-
#
|
1259 |
-
|
1260 |
-
|
1261 |
-
|
1262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1263 |
content_paragraphs.append(paragraph)
|
1264 |
-
|
1265 |
-
|
1266 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1267 |
except Exception as e:
|
1268 |
-
|
1269 |
-
|
|
|
|
|
1270 |
|
1271 |
def get_processor_info(self) -> Dict[str, Any]:
|
1272 |
"""
|
|
|
1218 |
if not re.search(r'[.!?]', response):
|
1219 |
raise ResponseProcessingError("Response lacks proper sentence structure")
|
1220 |
|
1221 |
+
def remove_explanatory_notes(response: str, debug: bool = False) -> str:
    """
    Strip explanatory notes and internal-processing chatter from an LLM response.

    The response is split into paragraphs on blank lines. A single-paragraph
    response is cleaned in place (matching lines and known meta phrases are
    removed); a multi-paragraph response has whole explanatory paragraphs
    dropped. The returned text is whitespace-normalised onto a single line.

    NOTE(review): this function takes no ``self``. If it is defined inside a
    class body it must be called through the class or be decorated with
    ``@staticmethod`` -- confirm against the callers.

    Args:
        response: Response text that may contain notes.
        debug: When True, print diagnostic information to stdout.

    Returns:
        str: The cleaned text. The original ``response`` is returned unchanged
        when it is empty/blank, when cleaning would leave nothing, or when an
        unexpected error occurs during cleaning.
    """
    if not response or not response.strip():
        return response

    original_response = response

    try:
        line_flags = re.IGNORECASE | re.MULTILINE

        # Stage 1: line-level patterns that mark a line as an explanatory note.
        note_patterns = [
            re.compile(pattern, line_flags)
            for pattern in (
                # Lines starting with "Note:"
                r'(?:^|\n)\s*Note\s*:.*?(?:\n|$)',
                # Explanations starting with "I have ..."
                r'(?:^|\n)\s*I\s+have\s+(?:followed|adhered\s+to|ensured|strictly\s+adhered\s+to|also\s+followed).*?(?:\n|$)',
                # Explanations starting with "This description ..."
                r'(?:^|\n)\s*This\s+description\s+(?:follows|adheres\s+to|maintains).*?(?:\n|$)',
                # Explanations starting with "The enhanced description ..."
                r'(?:^|\n)\s*The\s+enhanced\s+description\s+(?:maintains|preserves).*?(?:\n|$)',
                # "Additionally, I have ..."
                r'(?:^|\n)\s*Additionally,?\s*I\s+have.*?(?:\n|$)',
                # Explanations starting with "I've ..."
                r'(?:^|\n)\s*I\'ve\s+(?:maintained|preserved|ensured|avoided).*?(?:\n|$)',
                # Lines starting with "Please note"
                r'(?:^|\n)\s*Please\s+note.*?(?:\n|$)',
                # Lines starting with "Remember"
                r'(?:^|\n)\s*Remember.*?(?:\n|$)',
                # Parenthesised explanations
                r'\([^)]*(?:adhered|followed|rule|accuracy|speculation)[^)]*\)',
                # Lines mentioning "avoiding any assumptions" and similar
                r'(?:^|\n).*?avoiding\s+any\s+(?:assumptions|inferences|speculation).*?(?:\n|$)',
                # Lines mentioning "object whitelist"
                r'(?:^|\n).*?object\s+whitelist.*?(?:\n|$)',
                # Lines mentioning "detail accuracy rule"
                r'(?:^|\n).*?detail\s+accuracy\s+rule.*?(?:\n|$)',
                # Lines mentioning "critical adherence"
                r'(?:^|\n).*?critical\s+adherence.*?(?:\n|$)',
                # Lines mentioning "transitional phrases"
                r'(?:^|\n).*?transitional\s+phrases.*?(?:\n|$)',
                # Lines mentioning "varying sentence structures"
                r'(?:^|\n).*?varying\s+sentence\s+structures.*?(?:\n|$)',
                # Lines mentioning "natural flow"
                r'(?:^|\n).*?natural\s+flow.*?(?:\n|$)',
                # Long-form restatements of the prompt rules
                r'(?:^|\n).*?(?:focused\s+on\s+describing|clear\s+and\s+concise\s+manner).*?(?:\n|$)',
            )
        ]

        # Meta phrases that can appear mid-line. Each match runs to the end of
        # its sentence ("[^.\n]*\.?"); a lazy ".*?\.?" tail would match nothing
        # and leave the remainder of the sentence behind.
        problematic_phrases = [
            re.compile(pattern, re.IGNORECASE)
            for pattern in (
                r'\b(?:Note\s+that\s+)?I\s+have\s+strictly\s+adhered\s+to[^.\n]*\.?',
                r'\b(?:Additionally,?\s*)?I\s+have\s+followed[^.\n]*\.?',
                r'\b(?:I\s+have\s+)?avoided\s+(?:any\s+)?(?:assumptions|speculation)[^.\n]*\.?',
                r'\busing\s+transitional\s+phrases[^.\n]*\.?',
                r'\bcreate\s+a\s+natural\s+flow[^.\n]*\.?',
            )
        ]

        def _strip_phrases(text: str) -> str:
            # Remove every known meta phrase (through the end of its sentence).
            for phrase in problematic_phrases:
                text = phrase.sub('', text)
            return text

        def _tidy(text: str) -> str:
            # Collapse whitespace and repair punctuation left behind by removals.
            text = re.sub(r'\s+', ' ', text)
            text = re.sub(r'\s*,\s*,\s*', ', ', text)
            # Merge a doubled period, but leave a real ellipsis ("...") alone.
            text = re.sub(r'(?<!\.)\s*\.\s*\.(?!\.)\s*', '. ', text)
            text = re.sub(r'\s+([,.!?])', r'\1', text)
            return text.strip()

        # Stage 2: split into paragraphs on blank lines.
        paragraphs = [p.strip() for p in response.split('\n\n') if p.strip()]

        if debug:
            print(f"Original paragraphs count: {len(paragraphs)}")

        # Stage 3: a single paragraph is cleaned in place.
        if len(paragraphs) <= 1:
            cleaned_text = response
            for pattern in note_patterns:
                # Replace with a newline rather than '': the patterns consume the
                # line breaks on both sides, so an empty replacement would glue
                # the neighbouring sentences together.
                cleaned_text = pattern.sub('\n', cleaned_text)
            cleaned_text = _tidy(_strip_phrases(cleaned_text))

            if not cleaned_text:
                # The line-level patterns wiped everything (typical for a
                # one-line response). Retry sentence by sentence so the real
                # content survives and only the note sentences are dropped.
                sentences = re.split(r'(?<=[.!?])\s+', response.strip())
                kept_sentences = [
                    sentence for sentence in sentences
                    if not any(pattern.search(sentence) for pattern in note_patterns)
                ]
                cleaned_text = _tidy(_strip_phrases(' '.join(kept_sentences)))
                if debug:
                    print(f"Sentence-level fallback kept {len(kept_sentences)} of {len(sentences)} sentences")

            # Never hand back an empty string for a non-empty response.
            return cleaned_text if cleaned_text else original_response

        # Stage 4: multiple paragraphs - keep only the content paragraphs.
        explanatory_starters = (
            'note:', 'please note:', 'remember:', 'i have followed',
            'i have adhered', 'i have strictly', 'additionally, i',
            'this description follows', 'the enhanced description',
            'i\'ve maintained', 'i\'ve preserved', 'i\'ve ensured',
        )
        rule_keywords = (
            'adherence', 'whitelist', 'accuracy rule', 'assumptions',
            'inferences', 'speculation', 'transitional phrases',
        )

        def _removal_reason(paragraph: str) -> Optional[str]:
            # Return why the paragraph is explanatory, or None if it is content.
            if any(pattern.search(paragraph) for pattern in note_patterns):
                return "explanatory paragraph"
            lowered = paragraph.lower()
            starter = next((s for s in explanatory_starters if lowered.startswith(s)), None)
            if starter is not None:
                return f"paragraph starting with '{starter}'"
            # Two or more rule-related keywords mark the paragraph as rule talk.
            if sum(1 for keyword in rule_keywords if keyword in lowered) >= 2:
                return "rule-heavy paragraph"
            return None

        content_paragraphs = []
        for paragraph in paragraphs:
            reason = _removal_reason(paragraph)
            if reason is None:
                content_paragraphs.append(paragraph)
            elif debug:
                print(f"Removed {reason}: {paragraph[:50]}...")

        # Stage 5: reassemble the surviving paragraphs.
        if content_paragraphs:
            result = '\n\n'.join(content_paragraphs).strip()
        else:
            # Everything was classified as explanatory: keep the longest
            # paragraph and apply only a basic cleanup to it.
            longest_para = max(paragraphs, key=len)
            result = re.sub(r'(?:Note:.*?\.)|(?:\([^)]*rule[^)]*\))', '', longest_para, flags=re.IGNORECASE)
            result = re.sub(r'\s+', ' ', result).strip()

        # Stage 6: final cleanup.
        if result:
            # Remove explanatory fragments that may still be trailing a sentence.
            result = re.sub(r'\s*,?\s*avoiding\s+any\s+(?:assumptions|speculation).*?(?=\.|$)', '', result, flags=re.IGNORECASE)
            result = re.sub(r'\s*,?\s*using\s+(?:transitional\s+phrases|clear\s+and\s+concise).*?(?=\.|$)', '', result, flags=re.IGNORECASE)

            # Normalise whitespace and punctuation spacing. The inner lookaround
            # skips a '.' or ',' sitting directly between two digits, so values
            # such as "3.5" and "1,000" are not split apart.
            result = re.sub(r'\s+', ' ', result)
            result = re.sub(r'\s*([,.!?])(?!(?<=\d[.,])\d)\s*', r'\1 ', result)
            result = re.sub(r'\s+([,.!?])', r'\1', result)
            result = result.strip()

        if debug:
            print(f"Cleaning completed. Original length: {len(original_response)}, Final length: {len(result)}")

        return result if result else original_response

    except Exception as e:
        # Cleaning is best-effort: on any failure return the original text.
        if debug:
            print(f"Error during cleaning: {str(e)}")
        return original_response
|
1400 |
|
1401 |
def get_processor_info(self) -> Dict[str, Any]:
|
1402 |
"""
|