import requests
from bs4 import BeautifulSoup
import fitz  # PyMuPDF
import os
import openai
import re
import gradio as gr
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import json


def connect_gspread(spread_sheet_key):
    """Connect to the Google Spreadsheet and return its first worksheet."""
    credentials_json = os.getenv('GOOGLE_CREDENTIALS')
    credentials_dict = json.loads(credentials_json)
    scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']

    credentials = ServiceAccountCredentials.from_json_keyfile_dict(credentials_dict, scope)
    gc = gspread.authorize(credentials)
    worksheet = gc.open_by_key(spread_sheet_key).sheet1
    return worksheet
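
# connect_gspread() assumes that the GOOGLE_CREDENTIALS environment variable holds
# the full service-account JSON key (the dict with "type", "project_id",
# "private_key", "client_email", ... that Google Cloud lets you download), and that
# the spreadsheet has been shared with that service account's client_email.
# A minimal sketch of providing it, assuming the key was saved as service_account.json:
#
#     export GOOGLE_CREDENTIALS="$(cat service_account.json)"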

spread_sheet_key = "1nSh6D_Gqdbhi1CB3wvD4OJUU6bji8-LE6HET7NTEjrM"
worksheet = connect_gspread(spread_sheet_key)
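
# The sheet doubles as a cache: summarize_paper_and_save_to_sheet() appends rows as
# [paper_id, paper_url, summary, token], and find_paper_in_sheet() reads them back
# with get_all_records(), which keys each row by the sheet's header row. The first
# row of the sheet is therefore assumed to contain headers whose 'URL' and 'summary'
# columns match the code below; the remaining column names are only a guess, e.g.:
#
#     paper_id | URL | summary | token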


def download_paper(paper_url):
    """Download a paper PDF and save it to a temporary file."""
    response = requests.get(paper_url)
    response.raise_for_status()  # fail early if the download returns an error page
    temp_pdf_path = "temp_paper.pdf"
    with open(temp_pdf_path, 'wb') as f:
        f.write(response.content)
    return temp_pdf_path


def extract_text_from_pdf(pdf_path):
    """Extract plain text from every page of a PDF."""
    text = ""
    with fitz.open(pdf_path) as doc:  # close the file so it can be deleted afterwards
        for page in doc:
            text += page.get_text()
    return text


def summarize_text_with_chat(text, max_length=20000):
    """Summarize text with the OpenAI Chat API and return (summary, total tokens used)."""
    openai.api_key = os.getenv('OPEN_AI_API_KEYS')
    trimmed_text = text[:max_length]  # rough character-based cap on the prompt size
    response = openai.chat.completions.create(
        model="gpt-4-0125-preview",
        messages=[
            # System prompt (Japanese): "Summarize the following document. Always include
            # the sections '## タイトル' (title), '## 要約' (summary), '## 専門用語解説' (glossary)."
            {"role": "system", "content": "次の文書を要約してください。必ず'## タイトル', '## 要約', '## 専門用語解説'を記載してください。"},
            {"role": "user", "content": trimmed_text}
        ],
        temperature=0.7,
        max_tokens=2000
    )
    summary_text = response.choices[0].message.content
    total_token = response.usage.total_tokens
    return summary_text, total_token
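
# The character-based cutoff above is only a proxy for the model's token limit; if
# precise budgeting mattered, the text could be trimmed with a tokenizer such as
# tiktoken instead (a sketch, assuming tiktoken is installed and max_length is
# reinterpreted as a token count):
#
#     import tiktoken
#     enc = tiktoken.get_encoding("cl100k_base")
#     trimmed_text = enc.decode(enc.encode(text)[:max_length])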


def fetch_paper_links(url):
    """Extract paper links from the given URL, dropping duplicates while keeping order."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    pattern = re.compile(r'^/papers/\d+\.\d+$')
    links = []
    for a in soup.find_all('a', href=True):
        href = a['href']
        if pattern.match(href) and href not in links:
            links.append(href)
    return links
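
# The matched hrefs look like '/papers/<arXiv id>' (for example '/papers/2401.12345',
# a hypothetical ID); gradio_interface() below takes the last path segment and reuses
# it to build the corresponding https://arxiv.org/pdf/<id>.pdf URL.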


def summarize_paper_and_save_to_sheet(paper_id):
    """Summarize one paper and append the result to the Google Spreadsheet."""
    paper_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
    pdf_path = download_paper(paper_url)
    text = extract_text_from_pdf(pdf_path)
    summary, token = summarize_text_with_chat(text)
    os.remove(pdf_path)
    worksheet.append_row([paper_id, paper_url, summary, token])
    return summary, token


def find_paper_in_sheet(paper_id):
    """Look up paper_id in the spreadsheet and return its saved summary, or None if absent."""
    records = worksheet.get_all_records()
    paper_id_url = f"https://arxiv.org/pdf/{paper_id}.pdf"

    for record in records:
        if record['URL'] == paper_id_url:
            return record['summary']

    return None


def gradio_interface():
    """Fetch today's paper links, summarize any uncached papers, and return Markdown."""
    paper_links = fetch_paper_links("https://huggingface.co/papers")
    paper_ids = set(link.split('/')[-1] for link in paper_links)

    total_tokens_used = 0  # tracked for new summaries, not currently shown in the UI
    summaries = []

    for paper_id in paper_ids:
        # Reuse the cached summary when the paper is already in the sheet; otherwise
        # summarize it now and record the tokens spent.
        summary = find_paper_in_sheet(paper_id)
        if summary is None:
            summary, tokens_used = summarize_paper_and_save_to_sheet(paper_id)
            total_tokens_used += tokens_used

        paper_id_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        summaries.append(f'論文: {paper_id_url}\n{summary}\n')

    summaries_markdown = "\n---\n".join(summaries)
    return summaries_markdown
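
# Because results are cached in the spreadsheet, only papers that have not been
# summarized before trigger an OpenAI call; the first run on a given day makes one
# chat completion per new paper on the Daily Papers page.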


iface = gr.Interface(
    fn=gradio_interface,
    inputs=[],
    outputs=gr.Markdown(),
    title="Daily Papers 日本語要約ツール",
    description="[Daily Papers](https://huggingface.co/papers)に掲載された論文を日本語で要約します。"
)
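
# With no input components, the UI is a single generate button plus the Markdown
# output produced by gradio_interface().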


if __name__ == "__main__":
    iface.launch()