Improve the chatpdf feature (改善chatpdf的功能)

Files changed:
- crazy_functional.py +0 -7
- crazy_functions/crazy_utils.py +168 -0
- crazy_functions/批量翻译PDF文档_多线程.py +1 -166
- crazy_functions/理解PDF文档内容.py +56 -132
- version +2 -2
crazy_functional.py
CHANGED
@@ -76,7 +76,6 @@ def get_crazy_functions():
     from crazy_functions.总结word文档 import 总结word文档
     from crazy_functions.批量翻译PDF文档_多线程 import 批量翻译PDF文档
     from crazy_functions.谷歌检索小助手 import 谷歌检索小助手
-    from crazy_functions.理解PDF文档内容 import 理解PDF文档内容
     from crazy_functions.理解PDF文档内容 import 理解PDF文档内容标准文件输入
     from crazy_functions.Latex全文润色 import Latex中文润色
     from crazy_functions.Latex全文翻译 import Latex中译英
@@ -108,11 +107,6 @@ def get_crazy_functions():
             "Color": "stop",
             "Function": HotReload(总结word文档)
         },
-        # "[测试功能] 理解PDF文档内容(Tk文件选择接口,仅本地)": {
-        #     # HotReload means hot reload: after editing the plugin code, changes take effect without restarting the program
-        #     "AsButton": False,  # put it in the drop-down menu
-        #     "Function": HotReload(理解PDF文档内容)
-        # },
         "[测试功能] 理解PDF文档内容(通用接口,读取文件输入区)": {
             # HotReload means hot reload: after editing the plugin code, changes take effect without restarting the program
             "Color": "stop",
@@ -131,7 +125,6 @@ def get_crazy_functions():
             "AsButton": False,  # put it in the drop-down menu
             "Function": HotReload(Latex中文润色)
         },
-
         "[测试功能] Latex项目全文中译英(输入路径或上传压缩包)": {
             # HotReload means hot reload: after editing the plugin code, changes take effect without restarting the program
             "Color": "stop",
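The plugin entries above wrap every function in HotReload. As a reference, a minimal sketch of what such a hot-reload wrapper can look like follows. This is an illustration under assumptions (the wrapped plugin is a top-level generator function), not the repository's actual toolbox.HotReload implementation:

import importlib

def hotreload(f):
    # Sketch of a hot-reload wrapper: on every call, re-import the module that
    # defines f and dispatch to the freshly loaded copy, so edits to plugin
    # code take effect without restarting the program.
    def decorated(*args, **kwargs):
        module = importlib.reload(importlib.import_module(f.__module__))
        yield from getattr(module, f.__name__)(*args, **kwargs)
    return decorated

Reloading the defining module on each call is what lets the menu comments promise that edited plugin code takes effect without a restart.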
crazy_functions/crazy_utils.py
CHANGED
@@ -360,3 +360,171 @@ def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
     # the Chinese full stop here is intentional; it serves as a marker
     res = cut(txt.replace('.', '。\n'), must_break_at_empty_line=False)
     return [r.replace('。\n', '.') for r in res]
+
+
+
+def read_and_clean_pdf_text(fp):
+    """
+    This function splits a PDF. It uses many tricks; the logic is messy, but the results are surprisingly good.
+
+    **Input**
+    - `fp`: path of the PDF file to read and clean
+
+    **Output**
+    - `meta_txt`: the cleaned text content as a string
+    - `page_one_meta`: the cleaned text content of the first page, as a list
+
+    **What it does**
+    Reads the PDF file and cleans up its text content. The cleaning rules include:
+    - extract the text of all blocks and merge it into one string
+    - remove short blocks (fewer than 100 characters), replacing them with a newline
+    - clean up redundant empty lines
+    - merge paragraph blocks that start with a lowercase word, joining them with a space
+    - remove duplicated line breaks
+    - replace every line break with two line breaks, so each paragraph is separated by a blank line
+    """
+    import fitz, copy
+    import re
+    import numpy as np
+    from colorful import print亮黄, print亮绿
+    fc = 0
+    fs = 1
+    fb = 2
+    REMOVE_FOOT_NOTE = True
+    REMOVE_FOOT_FFSIZE_PERCENT = 0.95
+    def primary_ffsize(l):
+        fsize_statiscs = {}
+        for wtf in l['spans']:
+            if wtf['size'] not in fsize_statiscs: fsize_statiscs[wtf['size']] = 0
+            fsize_statiscs[wtf['size']] += len(wtf['text'])
+        return max(fsize_statiscs, key=fsize_statiscs.get)
+
+    def ffsize_same(a,b):
+        return abs((a-b)/max(a,b)) < 0.02
+    # file_content = ""
+    with fitz.open(fp) as doc:
+        meta_txt = []
+        meta_font = []
+
+        meta_line = []
+        meta_span = []
+        for index, page in enumerate(doc):
+            # file_content += page.get_text()
+            text_areas = page.get_text("dict")  # get the text information on the page
+            for t in text_areas['blocks']:
+                if 'lines' in t:
+                    pf = 998
+                    for l in t['lines']:
+                        txt_line = "".join([wtf['text'] for wtf in l['spans']])
+                        pf = primary_ffsize(l)
+                        meta_line.append([txt_line, pf, l['bbox'], l])
+                        for wtf in l['spans']:  # for l in t['lines']:
+                            meta_span.append([wtf['text'], wtf['size'], len(wtf['text'])])
+                    # meta_line.append(["NEW_BLOCK", pf])
+            # block extraction: for each word segment within a line, for each line (joining cross-line words), for each block
+            meta_txt.extend([" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
+                '- ', '') for t in text_areas['blocks'] if 'lines' in t])
+            meta_font.extend([np.mean([np.mean([wtf['size'] for wtf in l['spans']])
+                             for l in t['lines']]) for t in text_areas['blocks'] if 'lines' in t])
+            if index == 0:
+                page_one_meta = [" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
+                    '- ', '') for t in text_areas['blocks'] if 'lines' in t]
+        # find the main body font
+        fsize_statiscs = {}
+        for span in meta_span:
+            if span[1] not in fsize_statiscs: fsize_statiscs[span[1]] = 0
+            fsize_statiscs[span[1]] += span[2]
+        main_fsize = max(fsize_statiscs, key=fsize_statiscs.get)
+        if REMOVE_FOOT_NOTE:
+            give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT
+
+        # split and regroup
+        mega_sec = []
+        sec = []
+        for index, line in enumerate(meta_line):
+            if index == 0:
+                sec.append(line[fc])
+                continue
+            if REMOVE_FOOT_NOTE:
+                if meta_line[index][fs] <= give_up_fize_threshold:
+                    continue
+            if ffsize_same(meta_line[index][fs], meta_line[index-1][fs]):
+                # try to detect a paragraph boundary
+                if meta_line[index][fc].endswith('.') and\
+                        (meta_line[index-1][fc] != 'NEW_BLOCK') and \
+                        (meta_line[index][fb][2] - meta_line[index][fb][0]) < (meta_line[index-1][fb][2] - meta_line[index-1][fb][0]) * 0.7:
+                    sec[-1] += line[fc]
+                    sec[-1] += "\n\n"
+                else:
+                    sec[-1] += " "
+                    sec[-1] += line[fc]
+            else:
+                if (index+1 < len(meta_line)) and \
+                        meta_line[index][fs] > main_fsize:
+                    # a single line in a larger font: treat it as a heading
+                    mega_sec.append(copy.deepcopy(sec))
+                    sec = []
+                    sec.append("# " + line[fc])
+                else:
+                    # try to detect a section boundary
+                    if meta_line[index-1][fs] > meta_line[index][fs]:
+                        sec.append("\n" + line[fc])
+                    else:
+                        sec.append(line[fc])
+        mega_sec.append(copy.deepcopy(sec))
+
+        finals = []
+        for ms in mega_sec:
+            final = " ".join(ms)
+            final = final.replace('- ', ' ')
+            finals.append(final)
+        meta_txt = finals
+
+        def 把字符太少的块清除为回车(meta_txt):
+            for index, block_txt in enumerate(meta_txt):
+                if len(block_txt) < 100:
+                    meta_txt[index] = '\n'
+            return meta_txt
+        meta_txt = 把字符太少的块清除为回车(meta_txt)
+
+        def 清理多余的空行(meta_txt):
+            for index in reversed(range(1, len(meta_txt))):
+                if meta_txt[index] == '\n' and meta_txt[index-1] == '\n':
+                    meta_txt.pop(index)
+            return meta_txt
+        meta_txt = 清理多余的空行(meta_txt)
+
+        def 合并小写开头的段落块(meta_txt):
+            def starts_with_lowercase_word(s):
+                pattern = r"^[a-z]+"
+                match = re.match(pattern, s)
+                if match:
+                    return True
+                else:
+                    return False
+            for _ in range(100):
+                for index, block_txt in enumerate(meta_txt):
+                    if starts_with_lowercase_word(block_txt):
+                        if meta_txt[index-1] != '\n':
+                            meta_txt[index-1] += ' '
+                        else:
+                            meta_txt[index-1] = ''
+                        meta_txt[index-1] += meta_txt[index]
+                        meta_txt[index] = '\n'
+            return meta_txt
+        meta_txt = 合并小写开头的段落块(meta_txt)
+        meta_txt = 清理多余的空行(meta_txt)
+
+        meta_txt = '\n'.join(meta_txt)
+        # remove duplicated line breaks
+        for _ in range(5):
+            meta_txt = meta_txt.replace('\n\n', '\n')
+
+        # one line break -> two line breaks
+        meta_txt = meta_txt.replace('\n', '\n\n')
+
+        for f in finals:
+            print亮黄(f)
+            print亮绿('***************************')
+
+    return meta_txt, page_one_meta
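A minimal usage sketch of the helper added above, assuming PyMuPDF is installed (pip install --upgrade pymupdf); 'example.pdf' is a placeholder path, not a file from this commit:

from crazy_functions.crazy_utils import read_and_clean_pdf_text

# returns the cleaned full text and the cleaned blocks of the first page
meta_txt, page_one_meta = read_and_clean_pdf_text('example.pdf')
print(meta_txt[:500])       # cleaned full text; paragraphs are separated by blank lines
print(page_one_meta[:3])    # the first few cleaned text blocks of page one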
crazy_functions/批量翻译PDF文档_多线程.py
CHANGED
@@ -2,174 +2,9 @@ from toolbox import CatchException, report_execption, write_results_to_file
 from toolbox import update_ui
 from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
 from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
+from .crazy_utils import read_and_clean_pdf_text
 from colorful import *
 
-def read_and_clean_pdf_text(fp):
-    """
-    This function splits a PDF. It uses many tricks; the logic is messy, but the results are surprisingly good. No one is advised to read it.
-    """
-    ... (the remaining removed lines, the full body of read_and_clean_pdf_text, are identical to the copy added to crazy_utils.py above; the function was moved, not rewritten) ...
-
-
 @CatchException
 def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys_prompt, web_port):
     import glob
crazy_functions/理解PDF文档内容.py
CHANGED
@@ -1,142 +1,66 @@
 from toolbox import update_ui
 from toolbox import CatchException, report_execption
-import re
-import unicodedata
+from .crazy_utils import read_and_clean_pdf_text
 from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
 fast_debug = False
 
-def is_paragraph_break(match):
-    """
-    Decide from the given match whether a line break marks a paragraph boundary.
-    If the character before the break ends a sentence (period, exclamation mark, question mark)
-    and the next character is uppercase, the break is more likely a paragraph boundary.
-    The length of the preceding content is also used to judge whether the paragraph is long enough.
-    """
-    prev_char, next_char = match.groups()
-
-    # sentence-ending punctuation
-    sentence_endings = ".!?"
-
-    # minimum paragraph length threshold
-    min_paragraph_length = 140
-
-    if prev_char in sentence_endings and next_char.isupper() and len(match.string[:match.start(1)]) > min_paragraph_length:
-        return "\n\n"
-    else:
-        return " "
-
-def normalize_text(text):
-    """
-    Normalize the text by converting ligatures and other special symbols
-    to their basic forms, e.g. the ligature "fi" becomes "f" plus "i".
-    """
-    # normalize the text, decomposing ligatures
-    normalized_text = unicodedata.normalize("NFKD", text)
-
-    # strip other special characters
-    cleaned_text = re.sub(r'[^\x00-\x7F]+', '', normalized_text)
-
-    return cleaned_text
-
-def clean_text(raw_text):
-    """
-    Clean and format the raw text extracted from a PDF:
-    1. normalize the raw text;
-    2. rejoin words hyphenated across line breaks, e.g. "Espe-\ncially" becomes "Especially";
-    3. decide heuristically whether each line break separates paragraphs, and replace it accordingly.
-    """
-    # normalize the text
-    normalized_text = normalize_text(raw_text)
-
-    # rejoin hyphenated words
-    text = re.sub(r'(\w+-\n\w+)', lambda m: m.group(1).replace('-\n', ''), normalized_text)
-
-    # locate line breaks by the characters on either side
-    newlines = re.compile(r'(\S)\n(\S)')
-
-    # replace each line break with a space or a paragraph separator, per the heuristic
-    final_text = re.sub(newlines, lambda m: m.group(1) + is_paragraph_break(m) + m.group(2), text)
-
-    return final_text.strip()
 
 def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
-    import ...
+    import tiktoken
     print('begin analysis on:', file_name)
-    ... (removed lines 64-116 elided) ...
-    import tkinter as tk
-    from tkinter import filedialog
-
-    root = tk.Tk()
-    root.withdraw()
-    txt = filedialog.askopenfilename()
-
-    # try to import dependencies; if missing, suggest installation instructions
-    try:
-        import fitz
-    except:
-        report_execption(chatbot, history,
-                         a = f"解析项目: {txt}",
-                         b = f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pymupdf```。")
-        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
-        return
-
-    # clear the history to avoid input overflow
-    history = []
-
-    # officially begin the task
-    yield from 解析PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
-
+    file_content, page_one = read_and_clean_pdf_text(file_name)
+
+    # Split the PDF recursively; each fragment should as far as possible be one complete
+    # section (introduction, experiment, ...), split further only when necessary,
+    # and must stay under 2500 tokens
+    TOKEN_LIMIT_PER_FRAGMENT = 2500
+
+    from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
+    from toolbox import get_conf
+    enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
+    def get_token_num(txt): return len(enc.encode(txt))
+    paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
+        txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
+    page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
+        txt=str(page_one), get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4)
+    # for better results, strip everything after the Introduction (if present)
+    paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]
+
+    ############################## <Step 1: extract high-value information from the abstract into history> ##################################
+    final_results = []
+    final_results.append(paper_meta)
+
+    ############################## <Step 2: iterate through the whole paper, distilling it> ##################################
+    i_say_show_user = f'首先你在英文语境下通读整篇论文。'; gpt_say = "[Local Message] 收到。"  # user-facing prompt
+    chatbot.append([i_say_show_user, gpt_say]); yield from update_ui(chatbot=chatbot, history=[])  # refresh the UI
+
+    iteration_results = []
+    last_iteration_result = paper_meta  # the initial value is the abstract
+    MAX_WORD_TOTAL = 4096
+    n_fragment = len(paper_fragments)
+    if n_fragment >= 20: print('文章极长,不能达到预期效果')
+    for i in range(n_fragment):
+        NUM_OF_WORD = MAX_WORD_TOTAL // n_fragment
+        i_say = f"Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i]}"
+        i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i][:200]}"
+        gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user,  # i_say = the prompt actually sent to chatgpt; i_say_show_user = the prompt shown to the user
+                                                                           llm_kwargs, chatbot,
+                                                                           history=["The main idea of the previous section is?", last_iteration_result],  # feed back the previous iteration's result
+                                                                           sys_prompt="Extract the main idea of this section."  # system prompt
+                                                                           )
+        iteration_results.append(gpt_say)
+        last_iteration_result = gpt_say
+
+    ############################## <Step 3: assemble the history> ##################################
+    final_results.extend(iteration_results)
+    final_results.append(f'接下来,你是一名专业的学术教授,利用以上信息,使用中文回答我的问题。')
+    # the next two messages are display-only and have no real effect
+    i_say_show_user = f'接下来,你是一名专业的学术教授,利用以上信息,使用中文回答我的问题。'; gpt_say = "[Local Message] 收到。"
+    chatbot.append([i_say_show_user, gpt_say])
+
+    ############################## <Step 4: cap the token count to prevent overflow in the answer> ##################################
+    from .crazy_utils import input_clipping
+    _, final_results = input_clipping("", final_results, max_token_limit=3200)
+    yield from update_ui(chatbot=chatbot, history=final_results)  # note that the history is replaced here
 
 
 @CatchException
@@ -146,7 +70,7 @@ def 理解PDF文档内容标准文件输入(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
     # basic info: feature, contributors
     chatbot.append([
         "函数插件功能?",
-        "理解PDF论文内容,并且将结合上下文内容,进行学术解答。函数插件贡献者: Hanzoe"])
+        "理解PDF论文内容,并且将结合上下文内容,进行学术解答。函数插件贡献者: Hanzoe, binary-husky"])
     yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
 
     # try to import dependencies; if missing, suggest installation instructions
version
CHANGED
@@ -1,5 +1,5 @@
 {
-    "version": 2.
+    "version": 2.68,
     "show_feature": true,
-    "new_feature": "
+    "new_feature": "改善理解pdf(chatpdf)功能 <-> 如果一键更新失败,可前往github手动更新"
 }