import requests
from bs4 import BeautifulSoup
import fitz
import os
import openai
import re
import gradio as gr


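# Overview: this script fetches the current Hugging Face Daily Papers, downloads each
# paper's PDF from arXiv, extracts its text, and produces a Japanese summary via the
# OpenAI Chat API, caching results under summaries/.
# Assumed dependencies (a sketch, versions not pinned): requests, beautifulsoup4,
# PyMuPDF (imported as fitz), openai>=1.0 (needed for openai.chat.completions.create),
# and gradio. The API key is read from the OPEN_AI_API_KEYS environment variable.
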
def download_paper(paper_url):
    """Download the paper PDF from the given URL and save it locally as a temporary file."""
    response = requests.get(paper_url)
    response.raise_for_status()  # fail early on HTTP errors instead of writing an error page to disk
    temp_pdf_path = "temp_paper.pdf"
    with open(temp_pdf_path, 'wb') as f:
        f.write(response.content)
    return temp_pdf_path


def extract_text_from_pdf(pdf_path):
    """Extract the text of every page from a PDF file."""
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text


def check_summary_exists(paper_id):
    """Return the cached summary for the given paper ID if one exists, otherwise None."""
    summary_path = f"summaries/{paper_id}.txt"
    if os.path.exists(summary_path):
        with open(summary_path, 'r', encoding='utf-8') as file:
            return file.read()
    return None


def save_summary(paper_id, summary):
    """Save the summary for the given paper ID to a file under summaries/."""
    os.makedirs('summaries', exist_ok=True)
    summary_path = f"summaries/{paper_id}.txt"
    with open(summary_path, 'w', encoding='utf-8') as file:
        file.write(summary)


def summarize_paper(paper_id):
    """Summarize a paper in Japanese from its paper ID, reusing a cached summary when available."""
    existing_summary = check_summary_exists(paper_id)
    if existing_summary:
        return existing_summary, 0

    paper_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
    pdf_path = download_paper(paper_url)
    text = extract_text_from_pdf(pdf_path)
    summary, tokens_used = summarize_text_with_chat(text)
    os.remove(pdf_path)  # delete the temporary PDF once its text has been extracted

    save_summary(paper_id, summary)
    return summary, tokens_used


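# Note: a cached summary is returned with a token count of 0, so the total reported by
# gradio_interface() below only reflects tokens spent on papers summarized in the current run.
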
def fetch_paper_links(url):
    """Collect links from the given URL that fully match the /papers/<id> pattern, de-duplicated while preserving order."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    pattern = re.compile(r'^/papers/\d+\.\d+$')
    links = []
    for a in soup.find_all('a', href=True):
        href = a['href']
        if pattern.match(href) and href not in links:
            links.append(href)
    return links


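# For reference: the regex above only accepts hrefs shaped like "/papers/<digits>.<digits>"
# (e.g. a hypothetical "/papers/2401.12345"); gradio_interface() below keeps only the part
# after the last "/" to recover the arXiv-style ID used in the PDF URL.
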
def summarize_text_with_chat(text, max_length=10000):
    """Summarize text with the OpenAI Chat API and return the summary and the number of tokens used."""
    openai.api_key = os.getenv('OPEN_AI_API_KEYS')
    trimmed_text = text[:max_length]  # character-based (not token-based) cutoff to keep the prompt small
    response = openai.chat.completions.create(
        model="gpt-4-0125-preview",
        messages=[
            # System prompt (in Japanese): "Summarize the following document. Always include
            # the sections '## タイトル' (title), '## 要約' (summary) and '## 専門用語解説' (glossary)."
            {"role": "system", "content": "次の文書を要約してください。必ず'## タイトル', '## 要約', '## 専門用語解説'を記載してください。"},
            {"role": "user", "content": trimmed_text}
        ],
        temperature=0.7,
        max_tokens=1000
    )
    summary_text = response.choices[0].message.content
    total_token = response.usage.total_tokens
    return summary_text, total_token


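# Minimal usage sketch (assumes OPEN_AI_API_KEYS is set and the account has access to
# the gpt-4-0125-preview model):
#   summary, tokens = summarize_text_with_chat("<full paper text>")
#   print(tokens, summary)
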
def gradio_interface():
    """Build the Markdown shown in the Gradio UI: a Japanese summary per paper plus the total token usage."""
    papers_url = 'https://huggingface.co/papers'
    paper_links = fetch_paper_links(papers_url)
    # De-duplicate IDs; note that the set does not preserve the order returned by fetch_paper_links().
    paper_ids = set(link.split('/')[-1] for link in paper_links)

    total_tokens_used = 0
    summaries = []

    for paper_id in paper_ids:
        summary_info = ""
        try:
            summary, tokens_used = summarize_paper(paper_id)
            total_tokens_used += tokens_used
            paper_id_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
            # "論文" is Japanese for "paper"; the label is followed by the PDF URL and the summary.
            summary_info += f'論文: {paper_id_url}\n{summary}\n'
        except Exception as e:
            summary_info += f"Error processing paper ID {paper_id}: {e}\n"

        summaries.append(summary_info)

    summaries_markdown = "\n---\n".join(summaries)
    # Trailing line (in Japanese): "Total number of tokens used across all summaries: <n>".
    return summaries_markdown + f"\n全ての要約で使用されたトータルトークン数: {total_tokens_used}"


iface = gr.Interface(
    fn=gradio_interface,
    inputs=[],
    outputs=gr.Markdown(),
    # Title (in Japanese): "Paper summarization tool".
    title="論文要約ツール",
    # Description (in Japanese): fetches today's papers from Daily Papers and summarizes them in Japanese.
    description="[Daily Papers](https://huggingface.co/papers)に掲載された本日の論文を取得し、日本語で要約します。"
)


if __name__ == "__main__":
    iface.launch()