当无法正常切割PDF文档时,强制切割
Browse files
- crazy_functions/crazy_utils.py +31 -10
- request_llm/bridge_chatgpt.py +1 -1
crazy_functions/crazy_utils.py
CHANGED
@@ -104,7 +104,7 @@ def request_gpt_model_in_new_thread_with_ui_alive(
|
|
104 |
mutable[0] += f"[Local Message] 警告,在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n"
|
105 |
if retry_op > 0:
|
106 |
retry_op -= 1
|
107 |
-
mutable[0] += f"[Local Message]
|
108 |
if "Rate limit reached" in tb_str:
|
109 |
time.sleep(30)
|
110 |
time.sleep(5)
|
@@ -312,7 +312,6 @@ def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit):
|
|
312 |
if get_token_fn(prev) < limit:
|
313 |
break
|
314 |
if cnt == 0:
|
315 |
-
print('what the fuck ?')
|
316 |
raise RuntimeError("存在一行极长的文本!")
|
317 |
# print(len(post))
|
318 |
# 列表递归接龙
|
@@ -325,8 +324,18 @@ def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit):
|
|
325 |
return cut(txt, must_break_at_empty_line=False)
|
326 |
|
327 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
328 |
def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
|
329 |
-
|
|
|
330 |
if get_token_fn(txt_tocut) <= limit:
|
331 |
return [txt_tocut]
|
332 |
else:
|
@@ -338,28 +347,40 @@ def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
|
|
338 |
if must_break_at_empty_line:
|
339 |
if lines[cnt] != "":
|
340 |
continue
|
341 |
-
print(cnt)
|
342 |
prev = "\n".join(lines[:cnt])
|
343 |
post = "\n".join(lines[cnt:])
|
344 |
if get_token_fn(prev) < limit:
|
345 |
break
|
346 |
if cnt == 0:
|
347 |
-
|
348 |
-
|
|
|
|
|
349 |
# print(len(post))
|
350 |
# 列表递归接龙
|
351 |
result = [prev]
|
352 |
-
result.extend(cut(post, must_break_at_empty_line))
|
353 |
return result
|
354 |
try:
|
|
|
355 |
return cut(txt, must_break_at_empty_line=True)
|
356 |
except RuntimeError:
|
357 |
try:
|
|
|
358 |
return cut(txt, must_break_at_empty_line=False)
|
359 |
except RuntimeError:
|
360 |
-
|
361 |
-
|
362 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
363 |
|
364 |
|
365 |
|
|
|
104 |
mutable[0] += f"[Local Message] 警告,在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n"
|
105 |
if retry_op > 0:
|
106 |
retry_op -= 1
|
107 |
+
mutable[0] += f"[Local Message] 重试中,请稍等 {retry_times_at_unknown_error-retry_op}/{retry_times_at_unknown_error}:\n\n"
|
108 |
if "Rate limit reached" in tb_str:
|
109 |
time.sleep(30)
|
110 |
time.sleep(5)
|
|
|
312 |
if get_token_fn(prev) < limit:
|
313 |
break
|
314 |
if cnt == 0:
|
|
|
315 |
raise RuntimeError("存在一行极长的文本!")
|
316 |
# print(len(post))
|
317 |
# 列表递归接龙
|
|
|
324 |
return cut(txt, must_break_at_empty_line=False)
|
325 |
|
326 |
|
327 |
+
def force_breakdown(txt, limit, get_token_fn):
|
328 |
+
"""
|
329 |
+
当无法用标点、空行分割时,我们用最暴力的方法切割
|
330 |
+
"""
|
331 |
+
for i in reversed(range(len(txt))):
|
332 |
+
if get_token_fn(txt[:i]) < limit:
|
333 |
+
return txt[:i], txt[i:]
|
334 |
+
return "Tiktoken未知错误", "Tiktoken未知错误"
|
335 |
+
|
336 |
def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
|
337 |
+
# 递归
|
338 |
+
def cut(txt_tocut, must_break_at_empty_line, break_anyway=False):
|
339 |
if get_token_fn(txt_tocut) <= limit:
|
340 |
return [txt_tocut]
|
341 |
else:
|
|
|
347 |
if must_break_at_empty_line:
|
348 |
if lines[cnt] != "":
|
349 |
continue
|
|
|
350 |
prev = "\n".join(lines[:cnt])
|
351 |
post = "\n".join(lines[cnt:])
|
352 |
if get_token_fn(prev) < limit:
|
353 |
break
|
354 |
if cnt == 0:
|
355 |
+
if break_anyway:
|
356 |
+
prev, post = force_breakdown(txt_tocut, limit, get_token_fn)
|
357 |
+
else:
|
358 |
+
raise RuntimeError(f"存在一行极长的文本!{txt_tocut}")
|
359 |
# print(len(post))
|
360 |
# 列表递归接龙
|
361 |
result = [prev]
|
362 |
+
result.extend(cut(post, must_break_at_empty_line, break_anyway=break_anyway))
|
363 |
return result
|
364 |
try:
|
365 |
+
# 第1次尝试,将双空行(\n\n)作为切分点
|
366 |
return cut(txt, must_break_at_empty_line=True)
|
367 |
except RuntimeError:
|
368 |
try:
|
369 |
+
# 第2次尝试,将单空行(\n)作为切分点
|
370 |
return cut(txt, must_break_at_empty_line=False)
|
371 |
except RuntimeError:
|
372 |
+
try:
|
373 |
+
# 第3次尝试,将英文句号(.)作为切分点
|
374 |
+
res = cut(txt.replace('.', '。\n'), must_break_at_empty_line=False) # 这个中文的句号是故意的,作为一个标识而存在
|
375 |
+
return [r.replace('。\n', '.') for r in res]
|
376 |
+
except RuntimeError as e:
|
377 |
+
try:
|
378 |
+
# 第4次尝试,将中文句号(。)作为切分点
|
379 |
+
res = cut(txt.replace('。', '。。\n'), must_break_at_empty_line=False)
|
380 |
+
return [r.replace('。。\n', '。') for r in res]
|
381 |
+
except RuntimeError as e:
|
382 |
+
# 第5次尝试,没办法了,随便切一下敷衍吧
|
383 |
+
return cut(txt, must_break_at_empty_line=False, break_anyway=True)
|
384 |
|
385 |
|
386 |
|
request_llm/bridge_chatgpt.py
CHANGED
@@ -96,7 +96,7 @@ def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="",
|
|
96 |
# 看门狗,如果超过期限没有喂狗,则终止
|
97 |
if len(observe_window) >= 2:
|
98 |
if (time.time()-observe_window[1]) > watch_dog_patience:
|
99 |
-
raise RuntimeError("
|
100 |
else: raise RuntimeError("意外Json结构:"+delta)
|
101 |
if json_data['finish_reason'] == 'length':
|
102 |
raise ConnectionAbortedError("正常结束,但显示Token不足,导致输出不完整,请削减单次输入的文本量。")
|
|
|
96 |
# 看门狗,如果超过期限没有喂狗,则终止
|
97 |
if len(observe_window) >= 2:
|
98 |
if (time.time()-observe_window[1]) > watch_dog_patience:
|
99 |
+
raise RuntimeError("用户取消了程序。")
|
100 |
else: raise RuntimeError("意外Json结构:"+delta)
|
101 |
if json_data['finish_reason'] == 'length':
|
102 |
raise ConnectionAbortedError("正常结束,但显示Token不足,导致输出不完整,请削减单次输入的文本量。")
|