|
from toolbox import update_ui |
|
from toolbox import CatchException, report_execption, write_results_to_file, predict_no_ui_but_counting_down, get_conf |
|
import re, requests, unicodedata, os |
|
|
|
def _sanitize_filename(name):
    """Return *name* rewritten so it is usable as a single file-name component."""
    table = str.maketrans({
        '?': '\uff1f',   # full-width question mark
        ':': '\uff1a',   # full-width colon
        '"': '\u201c',   # left double quotation mark
        '\n': '',
        '\r': '',
        # Path separators and other characters rejected by common filesystems.
        '/': '_',
        '\\': '_',
        '*': '_',
        '<': '_',
        '>': '_',
        '|': '_',
    })
    # Collapse runs of spaces left behind by the page title / removed newlines.
    return re.sub(r' {2,}', ' ', name.translate(table))


def download_arxiv_(url_pdf):
    """Download an arXiv paper as a PDF into ``./gpt_log/arxiv/``.

    Args:
        url_pdf: Either an arXiv URL (``.../abs/<id>`` or ``.../pdf/<id>.pdf``)
            or a bare identifier containing a dot and no slash
            (e.g. ``"1812.10695"``), which is expanded to an ``abs`` URL.

    Returns:
        ``(file_path, other_info)`` where ``file_path`` is the saved PDF and
        ``other_info`` is the metadata dict produced by ``get_name``;
        ``None`` when the input cannot be recognised as an arXiv reference.

    Raises:
        requests.HTTPError: if the PDF request returns an error status, so an
            HTML error page is never saved under a ``.pdf`` name.
    """
    if 'arxiv.org' not in url_pdf:
        # A bare identifier has a dot but no path separator.
        if ('.' in url_pdf) and ('/' not in url_pdf):
            new_url = 'https://arxiv.org/abs/' + url_pdf
            print('下载编号:', url_pdf, '自动定位:', new_url)
            return download_arxiv_(new_url)
        print('不能识别的URL!')
        return None

    # Normalise an abstract-page URL into the matching PDF URL.
    if 'abs' in url_pdf:
        url_pdf = url_pdf.replace('abs', 'pdf')
        url_pdf = url_pdf + '.pdf'

    # The abstract page is where the title and metadata are scraped from.
    url_abs = url_pdf.replace('.pdf', '').replace('pdf', 'abs')
    title, other_info = get_name(_url_=url_abs)

    # Prefix the file name with the year and any recognised venue.
    if '2' in other_info['year']:
        title = other_info['year'] + ' ' + title

    known_conf = ['NeurIPS', 'NIPS', 'Nature', 'Science', 'ICLR', 'AAAI']
    for k in known_conf:
        if k in other_info['comment']:
            title = k + ' ' + title

    download_dir = './gpt_log/arxiv/'
    os.makedirs(download_dir, exist_ok=True)
    file_path = download_dir + _sanitize_filename(title)

    print('下载中')
    proxies, = get_conf('proxies')
    r = requests.get(url_pdf, proxies=proxies)
    r.raise_for_status()
    with open(file_path, 'wb') as f:
        f.write(r.content)
    print('下载完成')

    return file_path, other_info
|
|
|
|
|
def _extract_comment(soup):
    """Return the "Comments" entry of the page's ``metatable`` element, or ''.

    Raises IndexError when the page has no element with class ``metatable``.
    """
    metatable = soup.find_all(class_='metatable')[0]

    # Preferred path: read the table row by row so a label stays attached to
    # its value. NOTE(review): assumes the metatable is rendered as <tr> rows;
    # confirm against the live page markup.
    for row in metatable.find_all('tr'):
        row_text = ' '.join(row.get_text(' ').split())
        if row_text.startswith('Comments'):
            return row_text

    # Fallback: flatten the text and split on runs of spaces. Splitting on a
    # single space could only ever yield the bare "Comments:" token.
    comment = ''
    flat = metatable.text.replace('\n', ' ')
    for chunk in re.split(r' {2,}', flat):
        if 'Comments' in chunk:
            comment = chunk
    return comment


def get_name(_url_):
    """Scrape the title and metadata of a paper from its abstract page.

    Args:
        _url_: URL of the abstract page to fetch.

    Returns:
        ``(file_name, other_details)``: ``file_name`` is the page ``<title>``
        text with ``'.pdf'`` appended; ``other_details`` is a dict with the
        keys ``'year'``, ``'abstract'``, ``'authors'`` and ``'comment'``.
        Each field is extracted on a best-effort basis and falls back to
        ``''`` when it cannot be found.

    Raises:
        requests.HTTPError: if the page request returns an error status.
    """
    from bs4 import BeautifulSoup

    print('正在获取文献名!')
    print(_url_)

    proxies, = get_conf('proxies')
    res = requests.get(_url_, proxies=proxies)
    res.raise_for_status()

    bs = BeautifulSoup(res.text, 'html.parser')
    other_details = {}

    # Year: first four-digit number in the dateline element.
    try:
        dateline = bs.find_all(class_='dateline')[0].text
        other_details['year'] = re.search(r'(\d{4})', dateline).group(1)
    except Exception:
        other_details['year'] = ''
        print('年份获取失败')

    # Abstract: handled separately so a missing dateline does not drop it.
    try:
        other_details['abstract'] = bs.find_all(class_='abstract mathjax')[0].text
    except Exception:
        other_details['abstract'] = ''
        print('abstract获取失败')

    # Authors: the text following the "Authors:" label.
    try:
        authors = bs.find_all(class_='authors')[0].text
        other_details['authors'] = authors.split('Authors:')[1]
    except Exception:
        other_details['authors'] = ''
        print('authors获取失败')

    # Comment: used by the caller to spot a venue name.
    try:
        other_details['comment'] = _extract_comment(bs)
    except Exception:
        other_details['comment'] = ''
        print('comment获取失败')

    # Reuse the already-parsed document instead of parsing the HTML again.
    title_str = bs.find('title').contents[0]
    print('获取成功:', title_str)

    return title_str + '.pdf', other_details
|
|
|
|
|
|
|
@CatchException
def 下载arxiv论文并翻译摘要(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
    """Download an arXiv paper and ask the model to translate its abstract.

    This is a generator: it yields from ``update_ui`` after each step.

    Args:
        txt: User input, passed to ``download_arxiv_`` (an arXiv URL or id).
        llm_kwargs: Forwarded unchanged to ``predict_no_ui_but_counting_down``.
        plugin_kwargs: Forwarded unchanged to ``predict_no_ui_but_counting_down``.
        chatbot: Chat display list; messages are appended to it in place.
        history: Incoming history; it is discarded and rebuilt by this plugin.
        system_prompt: Unused here.
        web_port: Unused here.
    """
    import shutil

    CRAZY_FUNCTION_INFO = "下载arxiv论文并翻译摘要,函数插件作者[binary-husky]。正在提取摘要并下载PDF文档……"

    # Announce what the plugin does.
    chatbot.append(["函数插件功能?", CRAZY_FUNCTION_INFO])
    yield from update_ui(chatbot=chatbot, history=history)

    # Check the optional dependencies up front so the user gets an actionable
    # message instead of a failure later on.
    try:
        import pdfminer, bs4
    except ImportError:
        report_execption(chatbot, history,
            a = f"解析项目: {txt}",
            b = f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pdfminer beautifulsoup4```。")
        yield from update_ui(chatbot=chatbot, history=history)
        return

    # Start from an empty history.
    history = []

    # Fetch the PDF and metadata. An unrecognised input makes download_arxiv_
    # return None, so the unpacking fails and is reported here as well.
    try:
        pdf_path, info = download_arxiv_(txt)
    except Exception:
        report_execption(chatbot, history,
            a = f"解析项目: {txt}",
            b = "下载pdf文件未成功")
        yield from update_ui(chatbot=chatbot, history=history)
        return

    # The model sees the scraped metadata; the user only sees the file path.
    i_say = f"请你阅读以下学术论文相关的材料,提取摘要,翻译为中文。材料如下:{str(info)}"
    i_say_show_user = f'请你阅读以下学术论文相关的材料,提取摘要,翻译为中文。论文:{pdf_path}'
    chatbot.append((i_say_show_user, "[Local Message] waiting gpt response."))
    yield from update_ui(chatbot=chatbot, history=history)
    msg = '正常'

    gpt_say = yield from predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, llm_kwargs, plugin_kwargs, history=[])
    chatbot[-1] = (i_say_show_user, gpt_say)
    history.append(i_say_show_user)
    history.append(gpt_say)
    # Pass the history list, consistent with the other update_ui calls above.
    yield from update_ui(chatbot=chatbot, history=history, msg=msg)

    # Move the PDF up into ./gpt_log/ and write the conversation to a file.
    shutil.copyfile(pdf_path, f'./gpt_log/{os.path.basename(pdf_path)}')
    os.remove(pdf_path)
    res = write_results_to_file(history)
    chatbot.append(("完成了吗?", res + "\n\nPDF文件也已经下载"))
    yield from update_ui(chatbot=chatbot, history=history, msg=msg)
|
|
|
|