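"""Paper summarization tool.

Fetches the papers listed on Hugging Face Daily Papers
(https://huggingface.co/papers), summarizes each one in Japanese with the
OpenAI Chat API, appends the results to a Google Spreadsheet, and serves the
summaries through a Gradio interface.
"""
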
import requests
from bs4 import BeautifulSoup
import fitz  # pip install PyMuPDF
import os
import openai
import re
import gradio as gr
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import json

def connect_gspread(spread_sheet_key):
    """Connect to Google Sheets and return the spreadsheet's first worksheet."""
    credentials_json = os.getenv('GOOGLE_CREDENTIALS')
    credentials_dict = json.loads(credentials_json)
    scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']

    credentials = ServiceAccountCredentials.from_json_keyfile_dict(credentials_dict, scope)
    gc = gspread.authorize(credentials)
    worksheet = gc.open_by_key(spread_sheet_key).sheet1
    return worksheet

# Connect to the target spreadsheet once at import time.
spread_sheet_key = "1nSh6D_Gqdbhi1CB3wvD4OJUU6bji8-LE6HET7NTEjrM"
worksheet = connect_gspread(spread_sheet_key)

def download_paper(paper_url):
    """Download a paper PDF and save it to a temporary file."""
    response = requests.get(paper_url)
    response.raise_for_status()  # fail early rather than saving an error page as a PDF
    temp_pdf_path = "temp_paper.pdf"
    with open(temp_pdf_path, 'wb') as f:
        f.write(response.content)
    return temp_pdf_path

def extract_text_from_pdf(pdf_path):
    """Extract plain text from every page of a PDF."""
    text = ""
    with fitz.open(pdf_path) as doc:  # context manager closes the document
        for page in doc:
            text += page.get_text()
    return text

def summarize_text_with_chat(text, max_length=10000):
    """Summarize text with the OpenAI Chat API; returns (summary, total tokens used)."""
    openai.api_key = os.getenv('OPENAI_API_KEY')
    # Trim the input so the prompt stays within the model's context window.
    trimmed_text = text[:max_length]
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[
            # System prompt (Japanese): "Summarize the following document. Always
            # include the sections '## タイトル' (title), '## 要約' (summary), and
            # '## 専門用語解説' (glossary of technical terms)."
            {"role": "system", "content": "次の文書を要約してください。必ず'## タイトル', '## 要約', '## 専門用語解説'を記載してください。"},
            {"role": "user", "content": trimmed_text}
        ],
        temperature=0.7,
        max_tokens=1000
    )
    summary_text = response.choices[0].message.content
    total_token = response.usage.total_tokens
    return summary_text, total_token

def fetch_paper_links(url):
    """Extract paper links from the given URL and drop duplicates."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Daily Papers links have the form "/papers/<arXiv id>", i.e. digits.digits.
    pattern = re.compile(r'^/papers/\d+\.\d+$')
    links = []
    for a in soup.find_all('a', href=True):
        href = a['href']
        if pattern.match(href) and href not in links:
            links.append(href)
    return links

def summarize_paper_and_save_to_sheet(paper_id):
    """Summarize a paper and append the result to the Google Spreadsheet."""
    paper_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
    pdf_path = download_paper(paper_url)
    text = extract_text_from_pdf(pdf_path)
    # summarize_text_with_chat returns (summary, tokens); unpack it so the sheet
    # row gets the summary string rather than the whole tuple.
    summary, total_tokens = summarize_text_with_chat(text)
    os.remove(pdf_path)
    worksheet.append_row([paper_id, paper_url, summary])
    return summary, total_tokens

def gradio_interface():
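    """Fetch today's paper IDs, summarize each one, and return Markdown output."""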
    paper_links = fetch_paper_links("https://huggingface.co/papers")
    paper_ids = set(link.split('/')[-1] for link in paper_links)

    total_tokens_used = 0
    summaries = []

    for paper_id in paper_ids:
        summary_info = ""
        try:
            summary, tokens_used = summarize_paper_and_save_to_sheet(paper_id)
            total_tokens_used += tokens_used
            paper_id_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
            summary_info += f'論文: {paper_id_url}\n{summary}\n'  # "論文" = "Paper"
        except Exception as e:
            summary_info += f"Error processing paper ID {paper_id}: {e}\n"
        
        summaries.append(summary_info)

    summaries_markdown = "\n---\n".join(summaries)  # separate summaries with horizontal rules
    # Footer (Japanese): "Total number of tokens used across all summaries: N".
    return summaries_markdown + f"\n全ての要約で使用されたトータルトークン数: {total_tokens_used}"

iface = gr.Interface(
    fn=gradio_interface,
    inputs=[],
    outputs=gr.Markdown(),
    title="論文要約ツール",  # "Paper Summarization Tool"
    # Description (Japanese): "Fetches today's papers listed on Daily Papers
    # (https://huggingface.co/papers) and summarizes them in Japanese."
    description="[Daily Papers](https://huggingface.co/papers)に掲載された本日の論文を取得し、日本語で要約します。"
)

if __name__ == "__main__":
    iface.launch()