Duplicate from ShiwenNi/ChatReviewer
Co-authored-by: ShiwenNi <[email protected]>
- .gitattributes +34 -0
- README.md +14 -0
- app.py +209 -0
- get_paper_from_pdf.py +193 -0
- requirements.txt +10 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,14 @@
+---
+title: ChatReviewer
+emoji: 💩
+colorFrom: red
+colorTo: pink
+sdk: gradio
+sdk_version: 3.22.1
+app_file: app.py
+pinned: false
+license: apache-2.0
+duplicated_from: ShiwenNi/ChatReviewer
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,209 @@
+import numpy as np
+import os
+import re
+import jieba
+from io import BytesIO
+import datetime
+import time
+import openai, tenacity
+import argparse
+import configparser
+import json
+import tiktoken
+import PyPDF2
+import gradio
+
+
+def contains_chinese(text):
+    for ch in text:
+        if u'\u4e00' <= ch <= u'\u9fff':
+            return True
+    return False
+
+
+def insert_sentence(text, sentence, interval):
+    # Insert `sentence` after every `interval` words, using jieba to
+    # tokenize Chinese lines and whitespace to tokenize everything else.
+    lines = text.split('\n')
+    new_lines = []
+
+    for line in lines:
+        if contains_chinese(line):
+            words = list(jieba.cut(line))
+            separator = ''
+        else:
+            words = line.split()
+            separator = ' '
+
+        new_words = []
+        count = 0
+
+        for word in words:
+            new_words.append(word)
+            count += 1
+
+            if count % interval == 0:
+                new_words.append(sentence)
+
+        new_lines.append(separator.join(new_words))
+
+    return '\n'.join(new_lines)
+
+
+# Define the Reviewer class
+class Reviewer:
+    # Initializer: store the configuration as attributes
+    def __init__(self, api, review_format, paper_pdf, language):
+        self.api = api
+        self.review_format = review_format
+        self.language = language
+        self.paper_pdf = paper_pdf
+        self.max_token_num = 4097
+        self.encoding = tiktoken.get_encoding("gpt2")
+
+    def review_by_chatgpt(self, paper_list):
+        text = self.extract_chapter(self.paper_pdf)
+        chat_review_text, total_token_used = self.chat_review(text=text)
+        return chat_review_text, total_token_used
+
+    @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
+                    stop=tenacity.stop_after_attempt(5),
+                    reraise=True)
+    def chat_review(self, text):
+        openai.api_key = self.api  # set the API key
+        review_prompt_token = 1000
+        text_token = len(self.encoding.encode(text))
+        # truncate the paper so that paper text plus review prompt fit in the context window
+        input_text_index = int(len(text) * (self.max_token_num - review_prompt_token) / (text_token + 1))
+        input_text = "This is the paper for your review:" + text[:input_text_index]
+        messages = [
+            {"role": "system", "content": "You are a professional reviewer. Now I will give you a paper. You need to give a complete review opinion according to the following requirements and format:" + self.review_format + " Must be output in {}.".format(self.language)},
+            {"role": "user", "content": input_text},
+        ]
+
+        response = openai.ChatCompletion.create(
+            model="gpt-3.5-turbo",
+            messages=messages,
+        )
+        result = ''
+        for choice in response.choices:
+            result += choice.message.content
+        result = insert_sentence(result, '**Generated by ChatGPT, no copying allowed!**', 25)
+        result += "\n\n⚠伦理声明/Ethics statement:\n--禁止直接复制生成的评论用于任何论文审稿工作!\n--Direct copying of generated comments for any paper review work is prohibited!"
+        print("********" * 10)
+        print(result)
+        print("********" * 10)
+        print("prompt_token_used:", response.usage.prompt_tokens)
+        print("completion_token_used:", response.usage.completion_tokens)
+        print("total_token_used:", response.usage.total_tokens)
+        print("response_time:", response.response_ms / 1000.0, 's')
+
+        return result, response.usage.total_tokens
+
+    def extract_chapter(self, pdf_path):
+        file_object = BytesIO(pdf_path)
+        pdf_reader = PyPDF2.PdfReader(file_object)
+        # total number of pages in the PDF
+        num_pages = len(pdf_reader.pages)
+        # initialize the extraction state and the extracted text
+        extraction_started = False
+        extracted_text = ""
+        # iterate over every page of the PDF
+        for page_number in range(num_pages):
+            page = pdf_reader.pages[page_number]
+            page_text = page.extract_text()
+
+            # once the Abstract heading is found, start extracting
+            if 'Abstract'.lower() in page_text.lower() and not extraction_started:
+                extraction_started = True
+                page_number_start = page_number
+            # while extraction is active, append the page text
+            if extraction_started:
+                extracted_text += page_text
+                # stop once we are more than one page past the starting page
+                if page_number_start + 1 < page_number:
+                    break
+        return extracted_text
+
+
+def main(api, review_format, paper_pdf, language):
+    start_time = time.time()
+    if not api or not review_format or not paper_pdf:
+        # return a placeholder second value so both Gradio outputs are filled
+        return "请输入完整内容!", ""
+    else:
+        # create a Reviewer object
+        reviewer1 = Reviewer(api, review_format, paper_pdf, language)
+        # run the review on the uploaded PDF
+        comments, total_token_used = reviewer1.review_by_chatgpt(paper_list=paper_pdf)
+        time_used = time.time() - start_time
+        output2 = "使用token数:" + str(total_token_used) + "\n花费时间:" + str(round(time_used, 2)) + "秒"
+        return comments, output2
+
+
+########################################################################################################
+# Title
+title = "🤖ChatReviewer🤖"
+# Description
+description = '''<div align='left'>
+
+<strong>ChatReviewer是一款基于ChatGPT-3.5的API开发的智能论文分析与建议助手。</strong>其用途如下:
+
+⭐️对论文的优缺点进行快速总结和分析,提高科研人员的文献阅读和理解的效率,紧跟研究前沿。
+
+⭐️对自己的论文进行分析,根据ChatReviewer生成的改进建议进行查漏补缺,进一步提高自己的论文质量。
+
+如果觉得很卡,可以点击右上角的Duplicate this Space,把ChatReviewer复制到你自己的Space中!(🈲:禁止直接复制生成的评论用于任何论文审稿工作!)
+
+本项目的[Github](https://github.com/nishiwen1214/ChatReviewer),欢迎Star和Fork,也欢迎大佬赞助让本项目快速成长!💗([获取Api Key](https://chatgpt.cn.obiscr.com/blog/posts/2023/How-to-get-api-key/))
+</div>
+'''
+
+# Build the Gradio interface
+inp = [gradio.inputs.Textbox(label="请输入你的API-key(sk开头的字符串)",
+                             default="",
+                             type='password'),
+       gradio.inputs.Textbox(lines=5,
+                             label="请输入特定的分析要求和格式(否则为默认格式)",
+                             default="""* Overall Review
+Please briefly summarize the main points and contributions of this paper.
+xxx
+* Paper Strength
+Please provide a list of the strengths of this paper, including but not limited to: innovative and practical methodology, insightful empirical findings or in-depth theoretical analysis,
+well-structured review of relevant literature, and any other factors that may make the paper valuable to readers. (Maximum length: 2,000 characters)
+(1) xxx
+(2) xxx
+(3) xxx
+* Paper Weakness
+Please provide a numbered list of your main concerns regarding this paper (so authors could respond to the concerns individually).
+These may include, but are not limited to: inadequate implementation details for reproducing the study, limited evaluation and ablation studies for the proposed method,
+correctness of the theoretical analysis or experimental results, lack of comparisons or discussions with widely-known baselines in the field, lack of clarity in exposition,
+or any other factors that may impede the reader's understanding or benefit from the paper. Please kindly refrain from providing a general assessment of the paper's novelty without providing detailed explanations. (Maximum length: 2,000 characters)
+(1) xxx
+(2) xxx
+(3) xxx
+* Questions To Authors And Suggestions For Rebuttal
+Please provide a numbered list of specific and clear questions that pertain to the details of the proposed method, evaluation setting, or additional results that would aid in supporting the authors' claims.
+The questions should be formulated in a manner that, after the authors have answered them during the rebuttal, it would enable a more thorough assessment of the paper's quality. (Maximum length: 2,000 characters)
+* Overall score (1-10)
+The paper is scored on a scale of 1-10, with 10 being the full mark, and 6 stands for borderline accept. Then give the reason for your rating.
+xxx"""),
+       gradio.inputs.File(label="请上传论文PDF(必填)", type="bytes"),
+       gradio.inputs.Radio(choices=["English", "Chinese"],
+                           default="English",
+                           label="选择输出语言"),
+       ]
+
+chat_reviewer_gui = gradio.Interface(fn=main,
+                                     inputs=inp,
+                                     outputs=[gradio.Textbox(lines=25, label="分析结果"), gradio.Textbox(lines=2, label="资源统计")],
+                                     title=title,
+                                     description=description)
+
+# Start server
+chat_reviewer_gui.launch(quiet=True, show_api=False)
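
A note on the truncation step in `chat_review` above: instead of re-encoding repeatedly, it cuts the paper at a character index scaled by the characters-per-token ratio, so the paper plus the roughly 1000-token review prompt stays inside the 4097-token context window. Below is a minimal, self-contained sketch of the same idea; the function name and the example budget are illustrative, not part of the app.

```python
import tiktoken


def truncate_to_token_budget(text: str, budget: int) -> str:
    # Same "gpt2" encoding the app uses for counting tokens.
    encoding = tiktoken.get_encoding("gpt2")
    text_tokens = len(encoding.encode(text))
    if text_tokens <= budget:
        return text
    # Scale the character cut-off by the average characters-per-token
    # ratio, mirroring input_text_index in chat_review above.
    cut = int(len(text) * budget / (text_tokens + 1))
    return text[:cut]


# e.g. leave ~1000 tokens of headroom for the review prompt, as the app does
excerpt = truncate_to_token_budget("a long paper text ... " * 1000, 4097 - 1000)
```

The cut is approximate (tokens are not uniformly sized), but it avoids a second encode pass and errs on the safe side for typical English text.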
get_paper_from_pdf.py
ADDED
@@ -0,0 +1,193 @@
+import fitz, io, os
+from PIL import Image
+from collections import Counter
+import json
+import re
+
+
+class Paper:
+    def __init__(self, path, title='', url='', abs='', authors=[]):
+        # Initialize a Paper object from a pdf path
+        self.url = url  # link to the article
+        self.path = path  # path to the pdf
+        self.section_names = []  # section titles
+        self.section_texts = {}  # section contents
+        self.abs = abs
+        self.title_page = 0
+        if title == '':
+            self.pdf = fitz.open(self.path)  # pdf document
+            self.title = self.get_title()
+            self.parse_pdf()
+        else:
+            self.title = title
+        self.authors = authors
+        self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
+        self.digit_num = [str(d + 1) for d in range(10)]
+        self.first_image = ''
+
+    def parse_pdf(self):
+        self.pdf = fitz.open(self.path)  # pdf document
+        self.text_list = [page.get_text() for page in self.pdf]
+        self.all_text = ' '.join(self.text_list)
+        self.extract_section_infomation()
+        self.section_texts.update({"title": self.title})
+        self.pdf.close()
+
+    # Identify each chapter name by its font size and return them as a list
+    def get_chapter_names(self, ):
+        # open the pdf file
+        doc = fitz.open(self.path)  # pdf document
+        text_list = [page.get_text() for page in doc]
+        all_text = ''
+        for text in text_list:
+            all_text += text
+        # collect the chapter names in a list
+        chapter_names = []
+        for line in all_text.split('\n'):
+            line_list = line.split(' ')
+            if '.' in line:
+                point_split_list = line.split('.')
+                space_split_list = line.split(' ')
+                if 1 < len(space_split_list) < 5:
+                    if 1 < len(point_split_list) < 5 and (
+                            point_split_list[0] in self.roman_num or point_split_list[0] in self.digit_num):
+                        # print("line:", line)
+                        chapter_names.append(line)
+
+        return chapter_names
+
+    def get_title(self):
+        doc = self.pdf  # the open pdf document
+        max_font_size = 0  # largest font size seen so far
+        max_string = ""  # string set in that largest font size
+        max_font_sizes = [0]
+        for page_index, page in enumerate(doc):  # iterate over the pages
+            text = page.get_text("dict")  # text info of the page
+            blocks = text["blocks"]  # list of text blocks
+            for block in blocks:  # iterate over the blocks
+                if block["type"] == 0 and len(block['lines']):  # text block
+                    if len(block["lines"][0]["spans"]):
+                        font_size = block["lines"][0]["spans"][0]["size"]  # font size of the first span of the first line
+                        max_font_sizes.append(font_size)
+                        if font_size > max_font_size:  # new largest font size
+                            max_font_size = font_size  # update the maximum
+                            max_string = block["lines"][0]["spans"][0]["text"]  # and the corresponding string
+        max_font_sizes.sort()
+        # print("max_font_sizes", max_font_sizes[-10:])
+        cur_title = ''
+        for page_index, page in enumerate(doc):  # iterate over the pages
+            text = page.get_text("dict")  # text info of the page
+            blocks = text["blocks"]  # list of text blocks
+            for block in blocks:  # iterate over the blocks
+                if block["type"] == 0 and len(block['lines']):  # text block
+                    if len(block["lines"][0]["spans"]):
+                        cur_string = block["lines"][0]["spans"][0]["text"]  # text of the first span
+                        font_flags = block["lines"][0]["spans"][0]["flags"]  # font flags of the first span
+                        font_size = block["lines"][0]["spans"][0]["size"]  # font size of the first span
+                        # print(font_size)
+                        if abs(font_size - max_font_sizes[-1]) < 0.3 or abs(font_size - max_font_sizes[-2]) < 0.3:
+                            # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
+                            if len(cur_string) > 4 and "arXiv" not in cur_string:
+                                # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
+                                if cur_title == '':
+                                    cur_title += cur_string
+                                else:
+                                    cur_title += ' ' + cur_string
+                                self.title_page = page_index
+                                # break
+        title = cur_title.replace('\n', ' ')
+        return title
+
+    def extract_section_infomation(self):
+        doc = fitz.open(self.path)
+
+        # collect every font size used in the document
+        font_sizes = []
+        for page in doc:
+            blocks = page.get_text("dict")["blocks"]
+            for block in blocks:
+                if 'lines' not in block:
+                    continue
+                lines = block["lines"]
+                for line in lines:
+                    for span in line["spans"]:
+                        font_sizes.append(span["size"])
+        most_common_size, _ = Counter(font_sizes).most_common(1)[0]
+
+        # use the most frequent font size as the threshold for heading detection
+        threshold = most_common_size * 1
+
+        section_dict = {}
+        last_heading = None
+        subheadings = []
+        heading_font = -1
+        # iterate over the pages looking for subheadings
+        found_abstract = False
+        upper_heading = False
+        font_heading = False
+        for page in doc:
+            blocks = page.get_text("dict")["blocks"]
+            for block in blocks:
+                if not found_abstract:
+                    try:
+                        text = json.dumps(block)
+                    except:
+                        continue
+                    if re.search(r"\bAbstract\b", text, re.IGNORECASE):
+                        found_abstract = True
+                        last_heading = "Abstract"
+                        section_dict["Abstract"] = ""
+                if found_abstract:
+                    if 'lines' not in block:
+                        continue
+                    lines = block["lines"]
+                    for line in lines:
+                        for span in line["spans"]:
+                            # if the current text is a subheading
+                            if not font_heading and span["text"].isupper() and sum(1 for c in span["text"] if c.isupper() and ('A' <= c <= 'Z')) > 4:  # for papers whose headings share the body font size but are all upper-case
+                                upper_heading = True
+                                heading = span["text"].strip()
+                                if "References" in heading:  # ignore everything after the references
+                                    self.section_names = subheadings
+                                    self.section_texts = section_dict
+                                    return
+                                subheadings.append(heading)
+                                if last_heading is not None:
+                                    section_dict[last_heading] = section_dict[last_heading].strip()
+                                section_dict[heading] = ""
+                                last_heading = heading
+                            if not upper_heading and span["size"] > threshold and re.match(  # the normal case: detect headings by font size
+                                    r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)*",
+                                    span["text"].strip()):
+                                font_heading = True
+                                if heading_font == -1:
+                                    heading_font = span["size"]
+                                elif heading_font != span["size"]:
+                                    continue
+                                heading = span["text"].strip()
+                                if "References" in heading:  # ignore everything after the references
+                                    self.section_names = subheadings
+                                    self.section_texts = section_dict
+                                    return
+                                subheadings.append(heading)
+                                if last_heading is not None:
+                                    section_dict[last_heading] = section_dict[last_heading].strip()
+                                section_dict[heading] = ""
+                                last_heading = heading
+                            # otherwise append the text to the previous subheading
+                            elif last_heading is not None:
+                                section_dict[last_heading] += " " + span["text"].strip()
+        self.section_names = subheadings
+        self.section_texts = section_dict
+
+
+def main():
+    path = r'demo.pdf'
+    paper = Paper(path=path)
+    paper.parse_pdf()
+    # for key, value in paper.section_text_dict.items():
+    #     print(key, value)
+    #     print("*"*40)
+
+
+if __name__ == '__main__':
+    main()
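
`extract_section_infomation` above rests on one layout heuristic: the most common span font size in the document is taken as the body-text size, and larger spans that look like Title Case are treated as section headings. Here is a minimal sketch of just that heuristic, assuming PyMuPDF is installed and using a placeholder `demo.pdf` like the `main()` above.

```python
import re
from collections import Counter

import fitz  # PyMuPDF

doc = fitz.open("demo.pdf")  # placeholder input file

# Body-text size = the most common span font size in the document.
sizes = [span["size"]
         for page in doc
         for block in page.get_text("dict")["blocks"] if "lines" in block
         for line in block["lines"]
         for span in line["spans"]]
body_size, _ = Counter(sizes).most_common(1)[0]

# Spans larger than body text that look like Title Case are candidate headings.
headings = [span["text"].strip()
            for page in doc
            for block in page.get_text("dict")["blocks"] if "lines" in block
            for line in block["lines"]
            for span in line["spans"]
            if span["size"] > body_size
            and re.match(r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)*", span["text"].strip())]
print(headings)
```

This is the skeleton of the method; the full version in the file also handles all-caps headings that share the body font size and stops at the References section.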
requirements.txt
ADDED
@@ -0,0 +1,10 @@
+PyMuPDF==1.21.1
+jieba
+tiktoken==0.2.0
+tenacity==8.2.2
+pybase64==1.2.3
+Pillow==9.4.0
+openai==0.27.0
+markdown
+gradio==3.20.1
+PyPDF2