File size: 7,243 Bytes
ea031ab
bf3eb0b
 
6bbc10f
bf3eb0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0666fec
bf3eb0b
 
 
 
 
 
 
0666fec
bf3eb0b
 
 
 
 
 
 
 
0666fec
bf3eb0b
 
 
 
 
 
 
 
 
 
 
 
0666fec
bf3eb0b
 
 
 
 
 
0666fec
bf3eb0b
 
6bbc10f
 
 
 
 
 
 
 
 
 
bf3eb0b
 
0666fec
bf3eb0b
 
 
 
 
 
0666fec
bf3eb0b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
from toolbox import update_ui
from toolbox import CatchException, report_execption, write_results_to_file, predict_no_ui_but_counting_down, get_conf
import re, requests, unicodedata, os
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
def download_arxiv_(url_pdf):
    if 'arxiv.org' not in url_pdf:
        if ('.' in url_pdf) and ('/' not in url_pdf):
            new_url = 'https://arxiv.org/abs/'+url_pdf
            print('下载编号:', url_pdf, '自动定位:', new_url)
            # download_arxiv_(new_url)
            return download_arxiv_(new_url)
        else:
            print('不能识别的URL!')
            return None
    if 'abs' in url_pdf:
        url_pdf = url_pdf.replace('abs', 'pdf')
        url_pdf = url_pdf + '.pdf'

    url_abs = url_pdf.replace('.pdf', '').replace('pdf', 'abs')
    title, other_info = get_name(_url_=url_abs)

    paper_id = title.split()[0]  # '[1712.00559]'
    if '2' in other_info['year']:
        title = other_info['year'] + ' ' + title

    known_conf = ['NeurIPS', 'NIPS', 'Nature', 'Science', 'ICLR', 'AAAI']
    for k in known_conf:
        if k in other_info['comment']:
            title = k + ' ' + title

    download_dir = './gpt_log/arxiv/'
    os.makedirs(download_dir, exist_ok=True)

    title_str = title.replace('?', '?')\
        .replace(':', ':')\
        .replace('\"', '“')\
        .replace('\n', '')\
        .replace('  ', ' ')\
        .replace('  ', ' ')

    requests_pdf_url = url_pdf
    file_path = download_dir+title_str
    # if os.path.exists(file_path):
    #     print('返回缓存文件')
    #     return './gpt_log/arxiv/'+title_str

    print('下载中')
    proxies, = get_conf('proxies')
    r = requests.get(requests_pdf_url, proxies=proxies)
    with open(file_path, 'wb+') as f:
        f.write(r.content)
    print('下载完成')

    # print('输出下载命令:','aria2c -o \"%s\" %s'%(title_str,url_pdf))
    # subprocess.call('aria2c --all-proxy=\"172.18.116.150:11084\" -o \"%s\" %s'%(download_dir+title_str,url_pdf), shell=True)

    x = "%s  %s %s.bib" % (paper_id, other_info['year'], other_info['authors'])
    x = x.replace('?', '?')\
        .replace(':', ':')\
        .replace('\"', '“')\
        .replace('\n', '')\
        .replace('  ', ' ')\
        .replace('  ', ' ')
    return './gpt_log/arxiv/'+title_str, other_info


def get_name(_url_):
    import os
    from bs4 import BeautifulSoup
    print('正在获取文献名!')
    print(_url_)

    # arxiv_recall = {}
    # if os.path.exists('./arxiv_recall.pkl'):
    #     with open('./arxiv_recall.pkl', 'rb') as f:
    #         arxiv_recall = pickle.load(f)

    # if _url_ in arxiv_recall:
    #     print('在缓存中')
    #     return arxiv_recall[_url_]

    proxies, = get_conf('proxies')
    res = requests.get(_url_, proxies=proxies)

    bs = BeautifulSoup(res.text, 'html.parser')
    other_details = {}

    # get year
    try:
        year = bs.find_all(class_='dateline')[0].text
        year = re.search(r'(\d{4})', year, re.M | re.I).group(1)
        other_details['year'] = year
        abstract = bs.find_all(class_='abstract mathjax')[0].text
        other_details['abstract'] = abstract
    except:
        other_details['year'] = ''
        print('年份获取失败')

    # get author
    try:
        authors = bs.find_all(class_='authors')[0].text
        authors = authors.split('Authors:')[1]
        other_details['authors'] = authors
    except:
        other_details['authors'] = ''
        print('authors获取失败')

    # get comment
    try:
        comment = bs.find_all(class_='metatable')[0].text
        real_comment = None
        for item in comment.replace('\n', ' ').split('   '):
            if 'Comments' in item:
                real_comment = item
        if real_comment is not None:
            other_details['comment'] = real_comment
        else:
            other_details['comment'] = ''
    except:
        other_details['comment'] = ''
        print('年份获取失败')

    title_str = BeautifulSoup(
        res.text, 'html.parser').find('title').contents[0]
    print('获取成功:', title_str)
    # arxiv_recall[_url_] = (title_str+'.pdf', other_details)
    # with open('./arxiv_recall.pkl', 'wb') as f:
    #     pickle.dump(arxiv_recall, f)

    return title_str+'.pdf', other_details



@CatchException
def 下载arxiv论文并翻译摘要(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):

    CRAZY_FUNCTION_INFO = "下载arxiv论文并翻译摘要,函数插件作者[binary-husky]。正在提取摘要并下载PDF文档……"
    import glob
    import os

    # 基本信息:功能、贡献者
    chatbot.append(["函数插件功能?", CRAZY_FUNCTION_INFO])
    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面

    # 尝试导入依赖,如果缺少依赖,则给出安装建议
    try:
        import pdfminer, bs4
    except:
        report_execption(chatbot, history, 
            a = f"解析项目: {txt}", 
            b = f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pdfminer beautifulsoup4```。")
        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
        return

    # 清空历史,以免输入溢出
    history = []

    # 提取摘要,下载PDF文档
    try:
        pdf_path, info = download_arxiv_(txt)
    except:
        report_execption(chatbot, history, 
            a = f"解析项目: {txt}", 
            b = f"下载pdf文件未成功")
        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
        return
    
    # 翻译摘要等
    i_say =            f"请你阅读以下学术论文相关的材料,提取摘要,翻译为中文。材料如下:{str(info)}"
    i_say_show_user =  f'请你阅读以下学术论文相关的材料,提取摘要,翻译为中文。论文:{pdf_path}'
    chatbot.append((i_say_show_user, "[Local Message] waiting gpt response."))
    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
    msg = '正常'
    # ** gpt request **
    # 单线,获取文章meta信息
    gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
        inputs=i_say,
        inputs_show_user=i_say_show_user,
        llm_kwargs=llm_kwargs,
        chatbot=chatbot, history=[],
        sys_prompt="Your job is to collect information from materials and translate to Chinese。",
    )

    # gpt_say = yield from predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, llm_kwargs, plugin_kwargs, history=[])   # 带超时倒计时
    chatbot[-1] = (i_say_show_user, gpt_say)
    history.append(i_say_show_user); history.append(gpt_say)
    yield from update_ui(chatbot=chatbot, history=chatbot, msg=msg) # 刷新界面
    # 写入文件
    import shutil
    # 重置文件的创建时间
    shutil.copyfile(pdf_path, f'./gpt_log/{os.path.basename(pdf_path)}'); os.remove(pdf_path)
    res = write_results_to_file(history)
    chatbot.append(("完成了吗?", res + "\n\nPDF文件也已经下载"))
    yield from update_ui(chatbot=chatbot, history=chatbot, msg=msg) # 刷新界面