DField committed on
Commit b4231b6 · verified · 1 Parent(s): fef46f4

Update app.py

Files changed (1)
  1. app.py +69 -62
app.py CHANGED
@@ -1,13 +1,49 @@
 import requests
 from bs4 import BeautifulSoup
-import fitz  # PyMuPDF
 import os
 import openai
 import re
 import gradio as gr

 def download_paper(paper_url):
-    """Download the paper PDF from the given URL and save it as a local temporary file."""
     response = requests.get(paper_url)
     temp_pdf_path = "temp_paper.pdf"
     with open(temp_pdf_path, 'wb') as f:
@@ -15,50 +51,19 @@ def download_paper(paper_url):
     return temp_pdf_path

 def extract_text_from_pdf(pdf_path):
-    """Extract text from a PDF file."""
     doc = fitz.open(pdf_path)
     text = ""
     for page in doc:
         text += page.get_text()
     return text

-def check_and_read_summary(paper_id):
-    """Check whether a summary for the given paper ID already exists and return its contents if it does."""
-    summary_path = os.path.join("summaries", f"{paper_id}.txt")
-    if os.path.exists(summary_path):
-        with open(summary_path, 'r', encoding='utf-8') as file:
-            return file.read()
-    else:
-        return None
-
-def save_summary(paper_id, summary):
-    """Save the summary for the given paper ID to a file."""
-    os.makedirs('summaries', exist_ok=True)
-    summary_path = os.path.join("summaries", f"{paper_id}.txt")
-    with open(summary_path, 'w', encoding='utf-8') as file:
-        file.write(summary)
-
-def summarize_paper(paper_id):
-    """Summarize the paper's content in Japanese from its paper ID."""
-    existing_summary = check_and_read_summary(paper_id)
-    if existing_summary is not None:
-        return existing_summary, 0  # report token usage as 0
-
-    paper_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
-    pdf_path = download_paper(paper_url)
-    text = extract_text_from_pdf(pdf_path)
-    summary, tokens_used = summarize_text_with_chat(text)
-    os.remove(pdf_path)  # delete the temporary file
-
-    save_summary(paper_id, summary)  # save the new summary
-    return summary, tokens_used
-
 def summarize_text_with_chat(text, max_length=10000):
-    """Summarize the text using the OpenAI Chat API."""
-    openai.api_key = os.getenv('OPEN_AI_API_KEYS')
     trimmed_text = text[:max_length]
-    response = openai.chat.completions.create(
-        model="gpt-4-0125-preview",
         messages=[
             {"role": "system", "content": "次の文書を要約してください。必ず'## タイトル', '## 要約', '## 専門用語解説'を記載してください。"},
             {"role": "user", "content": trimmed_text}
@@ -66,15 +71,13 @@ def summarize_text_with_chat(text, max_length=10000):
         temperature=0.7,
         max_tokens=1000
     )
-    summary_text = response.choices[0].message.content
-    total_token = response.usage.total_tokens
-    return summary_text, total_token

 def fetch_paper_links(url):
-    """Collect links from the given URL that exactly match a specific pattern and remove duplicates (order-preserving)."""
     response = requests.get(url)
     soup = BeautifulSoup(response.text, 'html.parser')
-    # Anchor the pattern with ^ and $ to detect exact matches only
     pattern = re.compile(r'^/papers/\d+\.\d+$')
     links = []
     for a in soup.find_all('a', href=True):
@@ -83,33 +86,37 @@ def fetch_paper_links(url):
             links.append(href)
     return links

-def gradio_interface():
-    papers_url = 'https://huggingface.co/papers'  # default URL
-    paper_links = fetch_paper_links(papers_url)
-    paper_ids = set(link.split('/')[-1] for link in paper_links)

-    total_tokens_used = 0
     summaries = []

     for paper_id in paper_ids:
-        summary_info = ""
-        try:
-            summary, tokens_used = summarize_paper(paper_id)
-            total_tokens_used += tokens_used
-            paper_id_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
-            summary_info += f'論文: {paper_id_url}\n{summary}\n'
-        except Exception as e:
-            summary_info += f"Error processing paper ID {paper_id}: {e}\n"
-
-        summaries.append(summary_info)

-    summaries_markdown = "\n---\n".join(summaries)  # separate summaries with a horizontal rule
-    return summaries_markdown + f"\n全ての要約で使用されたトータルトークン数: {total_tokens_used}"

-# Set up the Gradio interface
 iface = gr.Interface(
     fn=gradio_interface,
-    inputs=[],  # input section removed
     outputs=gr.Markdown(),
     title="論文要約ツール",
     description="[Daily Papers](https://huggingface.co/papers)に掲載された本日の論文を取得し、日本語で要約します。"
 
 import requests
 from bs4 import BeautifulSoup
+import fitz  # pip install PyMuPDF
 import os
 import openai
 import re
 import gradio as gr
+from google.oauth2.credentials import Credentials
+from googleapiclient.discovery import build
+from googleapiclient.http import MediaIoBaseUpload, MediaIoBaseDownload
+import io
+import json
+
+def google_drive_authenticate():
+    """Load the Google Drive credentials and return a service object."""
+    credentials_info = json.loads(os.getenv('GOOGLE_CREDENTIALS'))
+    credentials = Credentials.from_authorized_user_info(credentials_info)
+    service = build('drive', 'v3', credentials=credentials)
+    return service
+
+def save_to_google_drive(service, folder_id, filename, content):
+    """Save a file to Google Drive."""
+    file_metadata = {'name': filename, 'parents': [folder_id]}
+    media = MediaIoBaseUpload(io.BytesIO(content.encode()), mimetype='text/plain')
+    file = service.files().create(body=file_metadata, media_body=media, fields='id').execute()
+    return file.get('id')
+
+def find_in_google_drive(service, folder_id, paper_id):
+    """Search Google Drive for the file and return its contents."""
+    query = f"parents='{folder_id}' and name contains '{paper_id}' and trashed=false"
+    response = service.files().list(q=query, spaces='drive', fields='files(id, name)').execute()
+    if not response.get('files'):
+        return None
+    file_id = response.get('files')[0].get('id')
+    request = service.files().get_media(fileId=file_id)
+    fh = io.BytesIO()
+    downloader = MediaIoBaseDownload(fh, request)
+    done = False
+    while done is False:
+        _, done = downloader.next_chunk()
+    fh.seek(0)
+    content = fh.read().decode('utf-8')
+    return content
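`Credentials.from_authorized_user_info` expects authorized-user JSON (client ID, client secret, refresh token), so the `GOOGLE_CREDENTIALS` secret has to be minted once by a real OAuth consent flow. A minimal sketch of that one-time step, assuming the `google-auth-oauthlib` package and a `client_secret.json` downloaded from the Google Cloud console (both assumptions, not part of this commit):

```python
# One-time, local sketch to mint the JSON for the GOOGLE_CREDENTIALS secret.
# Assumptions: google-auth-oauthlib is installed and client_secret.json was
# downloaded from the Google Cloud console; neither ships with this commit.
from google_auth_oauthlib.flow import InstalledAppFlow

SCOPES = ['https://www.googleapis.com/auth/drive.file']

flow = InstalledAppFlow.from_client_secrets_file('client_secret.json', SCOPES)
creds = flow.run_local_server(port=0)  # opens a browser for user consent
print(creds.to_json())  # authorized-user JSON; store it as GOOGLE_CREDENTIALS
```

Note that the `drive.file` scope only covers files the app itself creates, which matches reading back its own saved summaries; reading files uploaded by other tools would need a broader scope.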

 def download_paper(paper_url):
+    """Download and save the paper PDF."""
     response = requests.get(paper_url)
     temp_pdf_path = "temp_paper.pdf"
     with open(temp_pdf_path, 'wb') as f:

     return temp_pdf_path

 def extract_text_from_pdf(pdf_path):
+    """Extract text from the PDF."""
     doc = fitz.open(pdf_path)
     text = ""
     for page in doc:
         text += page.get_text()
     return text

 def summarize_text_with_chat(text, max_length=10000):
+    """Summarize the text with the OpenAI Chat API."""
+    openai.api_key = os.getenv('OPENAI_API_KEY')
     trimmed_text = text[:max_length]
+    response = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo-0125",
         messages=[
             {"role": "system", "content": "次の文書を要約してください。必ず'## タイトル', '## 要約', '## 専門用語解説'を記載してください。"},
             {"role": "user", "content": trimmed_text}

         temperature=0.7,
         max_tokens=1000
     )
+    summary_text = response.choices[0].message['content']
+    return summary_text
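`openai.ChatCompletion.create` together with the dict-style access `response.choices[0].message['content']` is the interface of the pre-1.0 `openai` SDK, so this code implicitly pins `openai<1.0`. For reference, a sketch of the equivalent call under the v1 client (the model name is carried over from the commit; the rest follows the v1 API and is not part of this change):

```python
# Equivalent call with openai>=1.0 (sketch only; the commit targets the old SDK).
import os
from openai import OpenAI

client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
response = client.chat.completions.create(
    model="gpt-3.5-turbo-0125",
    messages=[
        {"role": "system", "content": "次の文書を要約してください。"},
        {"role": "user", "content": "(trimmed document text)"},
    ],
    temperature=0.7,
    max_tokens=1000,
)
summary_text = response.choices[0].message.content  # attribute access in v1
```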
 
 def fetch_paper_links(url):
+    """Extract paper links from the given URL and remove duplicates."""
     response = requests.get(url)
     soup = BeautifulSoup(response.text, 'html.parser')
     pattern = re.compile(r'^/papers/\d+\.\d+$')
     links = []
     for a in soup.find_all('a', href=True):

             links.append(href)
     return links
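The anchored pattern `^/papers/\d+\.\d+$` accepts only bare paper paths and rejects sub-pages or date listings; a quick illustrative check (the example hrefs are hypothetical):

```python
import re

pattern = re.compile(r'^/papers/\d+\.\d+$')

# Hypothetical hrefs of the kind a Daily Papers page might contain.
print(bool(pattern.match('/papers/2403.05530')))            # True: bare paper path
print(bool(pattern.match('/papers/2403.05530#community')))  # False: trailing fragment
print(bool(pattern.match('/papers/date/2024-03-08')))       # False: date listing, not a paper ID
```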

+def summarize_paper(paper_id, service, folder_id):
+    """Look up the summary on Google Drive, or generate and save a new one."""
+    existing_summary = find_in_google_drive(service, folder_id, paper_id)
+    if existing_summary:
+        return existing_summary
+    paper_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
+    pdf_path = download_paper(paper_url)
+    text = extract_text_from_pdf(pdf_path)
+    summary = summarize_text_with_chat(text)
+    os.remove(pdf_path)
+    filename = f"{paper_id}_summary.txt"
+    save_to_google_drive(service, folder_id, filename, summary)
+    return summary
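One caching subtlety: `find_in_google_drive` matches with `name contains '{paper_id}'`, so any file whose name merely contains the ID satisfies the lookup. The Drive v3 query language also supports equality if exact matching is ever wanted; a hedged sketch, not part of this commit (same `service` and `folder_id` as above, filename format taken from `summarize_paper`):

```python
# Sketch: exact-name lookup instead of "name contains" (assumes the
# "{paper_id}_summary.txt" naming used by summarize_paper above).
def find_summary_exact(service, folder_id, paper_id):
    query = (
        f"parents='{folder_id}' and "
        f"name='{paper_id}_summary.txt' and trashed=false"
    )
    response = service.files().list(q=query, spaces='drive',
                                    fields='files(id)').execute()
    files = response.get('files', [])
    return files[0]['id'] if files else None  # file ID, or None if absent
```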

+def gradio_interface():
+    service = google_drive_authenticate()
+    folder_id = '1yOXimp4kk7eohWKGtVo-gn93M0A404TM'
     summaries = []
+    paper_links = fetch_paper_links("https://huggingface.co/papers")
+    paper_ids = [link.split('/')[-1] for link in paper_links]

     for paper_id in paper_ids:
+        summary = summarize_paper(paper_id, service, folder_id)
+        summaries.append(summary)

+    summaries_markdown = "\n---\n".join(summaries)
+    return summaries_markdown

 iface = gr.Interface(
     fn=gradio_interface,
+    inputs=[],
     outputs=gr.Markdown(),
     title="論文要約ツール",
     description="[Daily Papers](https://huggingface.co/papers)に掲載された本日の論文を取得し、日本語で要約します。"
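For local testing, the pipeline can also be exercised without the Gradio UI; a minimal sketch, assuming the `OPENAI_API_KEY` and `GOOGLE_CREDENTIALS` environment variables are set and this file is importable as `app`:

```python
# Sketch: run one refresh cycle outside Gradio. Assumes OPENAI_API_KEY and
# GOOGLE_CREDENTIALS are exported in the environment and app.py is on the path.
from app import gradio_interface

markdown = gradio_interface()  # summarize today's papers, reusing cached Drive copies
print(markdown[:500])          # preview the start of the combined Markdown output
```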