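"""Daily Papers Japanese summarization tool.

Fetches the paper links listed on Hugging Face Daily Papers, summarizes each
paper in Japanese via the OpenAI Chat API, caches the results in a Google
Sheet, and serves everything through a Gradio interface.
"""
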
import json
import os
import re
from datetime import datetime

import gradio as gr
import gspread
import openai
import requests
from bs4 import BeautifulSoup
import fitz  # PyMuPDF: pip install PyMuPDF
from oauth2client.service_account import ServiceAccountCredentials
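
# Environment variables this script reads:
#   GOOGLE_CREDENTIALS - service-account JSON used for Google Sheets access
#   OPEN_AI_API_KEYS   - API key passed to the OpenAI client
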
def connect_gspread(spread_sheet_key):
    """Connect to Google Sheets and return the first worksheet of the given spreadsheet."""
    credentials_json = os.getenv('GOOGLE_CREDENTIALS')
    credentials_dict = json.loads(credentials_json)
    scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
    credentials = ServiceAccountCredentials.from_json_keyfile_dict(credentials_dict, scope)
    gc = gspread.authorize(credentials)
    worksheet = gc.open_by_key(spread_sheet_key).sheet1
    return worksheet

spread_sheet_key = "1nSh6D_Gqdbhi1CB3wvD4OJUU6bji8-LE6HET7NTEjrM"
worksheet = connect_gspread(spread_sheet_key)
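
# Assumed sheet layout: a header row whose columns include 'URL' and 'summary'
# (read back in find_paper_in_sheet), matching the column order written by
# append_row([paper_id, paper_url, summary, token]) in
# summarize_paper_and_save_to_sheet.
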
def download_paper(paper_url):
    """Download a paper PDF and save it to a temporary file."""
    response = requests.get(paper_url)
    response.raise_for_status()  # fail early on a bad HTTP status
    temp_pdf_path = "temp_paper.pdf"
    with open(temp_pdf_path, 'wb') as f:
        f.write(response.content)
    return temp_pdf_path

def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page in a PDF."""
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    return text

def summarize_text_with_chat(text, max_length=20000):
    """Summarize text with the OpenAI Chat API; return the summary and total token usage."""
    openai.api_key = os.getenv('OPEN_AI_API_KEYS')
    trimmed_text = text[:max_length]  # truncate to stay within the model's context window
    response = openai.chat.completions.create(
        model="gpt-4-0125-preview",
        messages=[
            {"role": "system", "content": "次の文書を要約してください。必ず'## タイトル', '## 要約', '## 専門用語解説'を記載してください。"},
            {"role": "user", "content": trimmed_text}
        ],
        temperature=0.7,
        max_tokens=2000
    )
    summary_text = response.choices[0].message.content
    total_token = response.usage.total_tokens
    return summary_text, total_token

def fetch_paper_links(url):
    """Extract paper links from the given URL, dropping duplicates."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    pattern = re.compile(r'^/papers/\d+\.\d+$')  # matches hrefs like /papers/2403.12345
    links = []
    for a in soup.find_all('a', href=True):
        href = a['href']
        if pattern.match(href) and href not in links:
            links.append(href)
    return links

def summarize_paper_and_save_to_sheet(paper_id):
    """Summarize a paper and append the result to the Google Sheet."""
    paper_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
    pdf_path = download_paper(paper_url)
    text = extract_text_from_pdf(pdf_path)
    summary, token = summarize_text_with_chat(text)
    os.remove(pdf_path)  # clean up the temporary PDF
    worksheet.append_row([paper_id, paper_url, summary, token])
    return summary, token

def find_paper_in_sheet(paper_id):
    """Search the spreadsheet for paper_id; return the cached summary if present, else None."""
    records = worksheet.get_all_records()
    paper_id_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
    # Scan every row for a matching URL
    for record in records:
        if record['URL'] == paper_id_url:
            return record['summary']
    # No matching row found
    return None

def gradio_interface(selected_date_str, fetch_today):
    if fetch_today:
        # "Today" mode: the undated page always lists the latest papers
        paper_links = fetch_paper_links("https://huggingface.co/papers")
    else:
        try:
            # Validate the date string and use it as-is when well-formed
            datetime.strptime(selected_date_str, "%Y-%m-%d")
            date = selected_date_str
        except ValueError:
            # Return an error message when the date format is invalid
            return "入力された日付が無効です。YYYY-MM-DD形式で入力してください。"
        paper_links = fetch_paper_links(f"https://huggingface.co/papers?date={date}")
    paper_ids = set(link.split('/')[-1] for link in paper_links)
    total_tokens_used = 0
    summaries = []
    for paper_id in paper_ids:
        # Reuse the cached summary when available; otherwise generate and store one
        summary = find_paper_in_sheet(paper_id)
        if summary is None:
            summary, tokens_used = summarize_paper_and_save_to_sheet(paper_id)
            total_tokens_used += tokens_used
        paper_id_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        summaries.append(f'論文: {paper_id_url}\n{summary}\n')
    summaries_markdown = "\n---\n".join(summaries)  # separate summaries with horizontal rules
    return summaries_markdown

# Gradio interface configuration.
# Note: gr.Button is not a valid input component for gr.Interface, so a
# checkbox toggles "today" mode here in place of the original button.
inputs = [
    gr.Text(label="日付をYYYY-MM-DD形式で入力", placeholder="例: 2024-03-28"),
    gr.Checkbox(label="Today")
]
iface = gr.Interface(
    fn=gradio_interface,
    inputs=inputs,
    outputs=gr.Markdown(),
    title="Daily Papers 日本語要約ツール",
    description="[Daily Papers](https://huggingface.co/papers)に掲載された論文を日本語で要約します。"
)
if __name__ == "__main__":
    iface.launch()
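
# launch() serves the app locally by default; Gradio's launch() also accepts
# share=True to create a temporary public link if needed.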