New arxiv paper plugin
crazy_functions/下载arxiv论文翻译摘要.py (+187 -0)
ADDED
@@ -0,0 +1,187 @@
from predict import predict_no_ui
from toolbox import CatchException, report_execption, write_results_to_file, predict_no_ui_but_counting_down, get_conf
import re, requests, unicodedata, os

def download_arxiv_(url_pdf):
    # Accepts a bare arxiv ID (e.g. '1712.00559'), an abs URL, or a pdf URL; downloads
    # the PDF into ./gpt_log/arxiv/ and returns (local_pdf_path, other_info).
    if 'arxiv.org' not in url_pdf:
        if ('.' in url_pdf) and ('/' not in url_pdf):
            new_url = 'https://arxiv.org/abs/'+url_pdf
            print('下载编号:', url_pdf, '自动定位:', new_url)
            # download_arxiv_(new_url)
            return download_arxiv_(new_url)
        else:
            print('不能识别的URL!')
            return None
    if 'abs' in url_pdf:
        url_pdf = url_pdf.replace('abs', 'pdf')
        url_pdf = url_pdf + '.pdf'

    url_abs = url_pdf.replace('.pdf', '').replace('pdf', 'abs')
    title, other_info = get_name(_url_=url_abs)

    paper_id = title.split()[0]  # '[1712.00559]'
    if '2' in other_info['year']:
        title = other_info['year'] + ' ' + title

    known_conf = ['NeurIPS', 'NIPS', 'Nature', 'Science', 'ICLR', 'AAAI']
    for k in known_conf:
        if k in other_info['comment']:
            title = k + ' ' + title

    download_dir = './gpt_log/arxiv/'
    os.makedirs(download_dir, exist_ok=True)

    # Replace characters that are unsafe in file names
    title_str = title.replace('?', '?')\
        .replace(':', ':')\
        .replace('\"', '“')\
        .replace('\n', '')\
        .replace('  ', ' ')\
        .replace('  ', ' ')

    requests_pdf_url = url_pdf
    file_path = download_dir+title_str
    # if os.path.exists(file_path):
    #     print('返回缓存文件')
    #     return './gpt_log/arxiv/'+title_str

    print('下载中')
    proxies, = get_conf('proxies')
    r = requests.get(requests_pdf_url, proxies=proxies)
    with open(file_path, 'wb+') as f:
        f.write(r.content)
    print('下载完成')

    # print('输出下载命令:','aria2c -o \"%s\" %s'%(title_str,url_pdf))
    # subprocess.call('aria2c --all-proxy=\"172.18.116.150:11084\" -o \"%s\" %s'%(download_dir+title_str,url_pdf), shell=True)

    x = "%s %s %s.bib" % (paper_id, other_info['year'], other_info['authors'])
    x = x.replace('?', '?')\
        .replace(':', ':')\
        .replace('\"', '“')\
        .replace('\n', '')\
        .replace('  ', ' ')\
        .replace('  ', ' ')
    return './gpt_log/arxiv/'+title_str, other_info

def get_name(_url_):
    # Scrape the arxiv abs page for the title, year, abstract, authors and comments.
    import os
    from bs4 import BeautifulSoup
    print('正在获取文献名!')
    print(_url_)

    # arxiv_recall = {}
    # if os.path.exists('./arxiv_recall.pkl'):
    #     with open('./arxiv_recall.pkl', 'rb') as f:
    #         arxiv_recall = pickle.load(f)

    # if _url_ in arxiv_recall:
    #     print('在缓存中')
    #     return arxiv_recall[_url_]

    proxies, = get_conf('proxies')
    res = requests.get(_url_, proxies=proxies)

    bs = BeautifulSoup(res.text, 'html.parser')
    other_details = {}

    # get year
    try:
        year = bs.find_all(class_='dateline')[0].text
        year = re.search(r'(\d{4})', year, re.M | re.I).group(1)
        other_details['year'] = year
        abstract = bs.find_all(class_='abstract mathjax')[0].text
        other_details['abstract'] = abstract
    except:
        other_details['year'] = ''
        print('年份获取失败')

    # get author
    try:
        authors = bs.find_all(class_='authors')[0].text
        authors = authors.split('Authors:')[1]
        other_details['authors'] = authors
    except:
        other_details['authors'] = ''
        print('authors获取失败')

    # get comment
    try:
        comment = bs.find_all(class_='metatable')[0].text
        real_comment = None
        for item in comment.replace('\n', ' ').split(' '):
            if 'Comments' in item:
                real_comment = item
        if real_comment is not None:
            other_details['comment'] = real_comment
        else:
            other_details['comment'] = ''
    except:
        other_details['comment'] = ''
        print('comments获取失败')

    title_str = BeautifulSoup(
        res.text, 'html.parser').find('title').contents[0]
    print('获取成功:', title_str)
    # arxiv_recall[_url_] = (title_str+'.pdf', other_details)
    # with open('./arxiv_recall.pkl', 'wb') as f:
    #     pickle.dump(arxiv_recall, f)

    return title_str+'.pdf', other_details


@CatchException
def 下载arxiv论文并翻译摘要(txt, top_p, temperature, chatbot, history, systemPromptTxt, WEB_PORT):

    CRAZY_FUNCTION_INFO = "下载arxiv论文并翻译摘要,作者 binary-husky。正在提取摘要并下载PDF文档……"
    import glob
    import os

    # Basic info: what this plugin does and who contributed it
    chatbot.append(["函数插件功能?", CRAZY_FUNCTION_INFO])
    yield chatbot, history, '正常'

    # Try to import the extra dependencies; if any are missing, suggest how to install them
    try:
        import pdfminer, bs4
    except:
        report_execption(chatbot, history,
                         a = f"解析项目: {txt}",
                         b = f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pdfminer beautifulsoup4```。")
        yield chatbot, history, '正常'
        return

    # Clear the history to avoid overflowing the model input
    history = []

    # Extract the abstract and download the PDF
    try:
        pdf_path, info = download_arxiv_(txt)
    except:
        report_execption(chatbot, history,
                         a = f"解析项目: {txt}",
                         b = f"下载pdf文件未成功")
        yield chatbot, history, '正常'
        return

    # Translate the abstract
    i_say = f"请你阅读以下学术论文相关的材料,提取摘要,翻译为中文。材料如下:{str(info)}"
    i_say_show_user = f'请你阅读以下学术论文相关的材料,提取摘要,翻译为中文。论文:{pdf_path}'
    chatbot.append((i_say_show_user, "[Local Message] waiting gpt response."))
    yield chatbot, history, '正常'
    msg = '正常'
    # ** gpt request **
    gpt_say = yield from predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, top_p, temperature, history=[])  # with a timeout countdown
    chatbot[-1] = (i_say_show_user, gpt_say)
    history.append(i_say_show_user); history.append(gpt_say)
    yield chatbot, history, msg
    # Write the results to a file
    import shutil
    # Reset the file's creation time (copy to a new name, then remove the original)
    shutil.copyfile(pdf_path, pdf_path.replace('.pdf', '.autodownload.pdf')); os.remove(pdf_path)
    res = write_results_to_file(history)
    chatbot.append(("完成了吗?", res))
    yield chatbot, history, msg
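
For quick manual testing outside the chat UI, a minimal standalone usage sketch of the downloader helper follows. It is not part of this commit and rests on a few assumptions: it is run from the project root (so `predict` and `toolbox` resolve), `get_conf('proxies')` returns a usable proxy setting (or None), and `beautifulsoup4` is installed.

from crazy_functions.下载arxiv论文翻译摘要 import download_arxiv_

# Accepts a bare arxiv ID or an abs/pdf URL; scrapes the abs page for metadata,
# saves the PDF under ./gpt_log/arxiv/ and returns (local_pdf_path, metadata_dict).
pdf_path, info = download_arxiv_('1712.00559')   # example ID taken from the code comment
print(pdf_path)                                  # local path of the downloaded PDF
print(info.get('abstract', ''))                  # abstract text, if the scrape succeeded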