HowardZhangdqs commited on
Commit
c65f6a0
·
verified ·
1 Parent(s): 03a843c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +382 -0
app.py ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import datetime
import hashlib
import html
import json
import os
from dataclasses import dataclass
# NOTE: this deliberately comes after `import datetime`; it rebinds the name
# `datetime` to the class, which is what the rest of the file expects.
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional
from urllib.parse import quote_plus

import gradio as gr
import requests
12
+
13
# Revised data classes (fields made Optional and given default values)
14
+
15
+
16
@dataclass
class Author:
    """One entry of a paper's author list.

    Every field defaults to ``None`` so an instance can be built from a
    partial payload (``parse_article`` fills fields with ``dict.get``).
    """

    # Value of the "_id" key of the author entry.
    _id: Optional[str] = None
    # Value of the "name" key; used as display text by ``format_author``.
    name: Optional[str] = None
    # Value of the "hidden" key; when truthy ``format_author`` appends a marker.
    hidden: Optional[bool] = None
21
+
22
+
23
@dataclass
class Paper:
    """The ``paper`` sub-object of a daily-papers item.

    Every field defaults to ``None`` so an instance can be built from a
    partial payload.
    """

    # Paper identifier; used to build huggingface.co/papers/<id> links.
    id: Optional[str] = None
    # Parsed author entries. Annotated Optional because the default is None
    # (the previous ``List[Author]`` annotation contradicted its own default).
    authors: Optional[List["Author"]] = None
    # Publication timestamp of the paper, parsed from an ISO-8601 string.
    publishedAt: Optional[datetime] = None
    title: Optional[str] = None
    summary: Optional[str] = None
    upvotes: Optional[int] = None
    # Identifier used to build the discussion link for the paper.
    discussionId: Optional[str] = None
32
+
33
+
34
@dataclass
class SubmittedBy:
    """The ``submittedBy`` sub-object of a daily-papers item (the submitting account).

    Field names mirror the payload keys read by ``parse_article``; every field
    defaults to ``None``.
    """

    _id: Optional[str] = None
    # Image URL rendered next to the submitter in ``format_paper_info``.
    avatarUrl: Optional[str] = None
    # Display name shown in the detail view.
    fullname: Optional[str] = None
    # Account handle; used to build the huggingface.co/<name> profile link.
    name: Optional[str] = None
    type: Optional[str] = None
    # NOTE(review): the three flags below are copied verbatim from the payload;
    # their exact meaning is not visible in this file.
    isPro: Optional[bool] = None
    isHf: Optional[bool] = None
    isMod: Optional[bool] = None
    followerCount: Optional[int] = None
45
+
46
+
47
@dataclass
class Article:
    """One top-level item of the daily-papers listing, as built by ``parse_article``.

    Every field defaults to ``None``.
    """

    # Nested paper record; ``None`` when the item carries no usable "paper" value.
    paper: Optional[Paper] = None
    # Timestamp parsed from the item's own "publishedAt" key.
    publishedAt: Optional[datetime] = None
    title: Optional[str] = None
    # Image URL shown at the top of the detail view.
    thumbnail: Optional[str] = None
    numComments: Optional[int] = None
    # Submitting account; ``None`` when the item has no "submittedBy" value.
    submittedBy: Optional[SubmittedBy] = None
    isAuthorParticipating: Optional[bool] = None
56
+
57
+
58
def safe_get(data: Dict, *keys: str) -> Any:
    """Walk *keys* through nested dictionaries without raising.

    Args:
        data: The outermost mapping (any non-dict value is tolerated).
        *keys: Keys to follow, outermost first.

    Returns:
        The value found at the end of the path, or ``None`` when a key is
        missing, an intermediate value is not a dict, or the final value is
        an empty dict.
    """
    current = data
    for key in keys:
        if isinstance(current, dict):
            # A missing key yields {} so the walk can continue uniformly.
            current = current.get(key, {})
        else:
            current = None
    if current == {}:
        return None
    return current
63
+
64
+
65
def parse_article(data: Dict[str, Any]) -> Article:
    """Build an ``Article`` from one raw listing item, tolerating missing keys.

    Args:
        data: One decoded JSON object from the daily-papers listing.

    Returns:
        An ``Article`` whose absent fields are ``None``. ``paper`` and
        ``submittedBy`` are ``None`` when the corresponding value is missing
        or empty.
    """

    def parse_datetime(dt_str: Optional[str]) -> Optional[datetime]:
        """Parse an ISO-8601 timestamp; return ``None`` when empty or malformed."""
        if not dt_str:
            return None
        try:
            # A trailing 'Z' is rewritten as an explicit UTC offset before parsing.
            iso_text = dt_str[:-1] + '+00:00' if dt_str.endswith('Z') else dt_str
            return datetime.fromisoformat(iso_text)
        except ValueError:
            return None

    def paper_field(name: str) -> Any:
        """Read one key of the nested "paper" object."""
        return safe_get(data, "paper", name)

    # Author list (built even when the paper object itself turns out unusable)
    author_list = [
        Author(
            _id=raw_author.get("_id"),
            name=raw_author.get("name"),
            hidden=raw_author.get("hidden"),
        )
        for raw_author in (paper_field("authors") or [])
    ]

    # Paper record
    paper = None
    if safe_get(data, "paper"):
        paper = Paper(
            id=paper_field("id"),
            authors=author_list,
            publishedAt=parse_datetime(paper_field("publishedAt")),
            title=paper_field("title"),
            summary=paper_field("summary"),
            upvotes=paper_field("upvotes"),
            discussionId=paper_field("discussionId"),
        )

    # Submitter record
    submitter_raw = safe_get(data, "submittedBy")
    submitted_by = None
    if submitter_raw:
        submitter_keys = (
            "_id", "avatarUrl", "fullname", "name", "type",
            "isPro", "isHf", "isMod", "followerCount",
        )
        submitted_by = SubmittedBy(**{key: submitter_raw.get(key) for key in submitter_keys})

    # Final object
    return Article(
        paper=paper,
        publishedAt=parse_datetime(data.get("publishedAt")),
        title=data.get("title"),
        thumbnail=data.get("thumbnail"),
        numComments=data.get("numComments"),
        submittedBy=submitted_by,
        isAuthorParticipating=data.get("isAuthorParticipating"),
    )
123
+
124
+
125
# Endpoint queried for the daily-papers listing.
API_URL = "https://huggingface.co/api/daily_papers"

# In-process response cache used by make_request, keyed by the MD5 hex digest of the URL.
cache = {}
128
+
129
+
130
def make_request(url: str):
    """GET *url* and return its decoded JSON body, memoised per URL.

    Successful responses are stored in the module-level ``cache`` and served
    from there on later calls. Proxies are taken from the ``HF_HTTP_PROXY`` /
    ``HF_HTTPS_PROXY`` environment variables when either is set.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The decoded JSON payload, or ``[]`` after three failed attempts
        (failures are printed, never raised, and never cached).
    """
    # The MD5 digest is only a compact cache key, not a security measure.
    url_hash = hashlib.md5(url.encode()).hexdigest()

    if url_hash in cache:
        print(f"Cache hit for URL: {url}")
        return cache[url_hash]

    http_proxy = os.getenv("HF_HTTP_PROXY")
    https_proxy = os.getenv("HF_HTTPS_PROXY")
    proxies = {
        "http": http_proxy,
        "https": https_proxy
    } if http_proxy or https_proxy else None

    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            # Without a timeout requests may block forever on a stalled connection.
            response = requests.get(url, proxies=proxies, timeout=30)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON decode error is not a RequestException subclass.
            print(f"Attempt {attempt} failed: {e}")
        else:
            cache[url_hash] = data
            return data

    return []
162
+
163
+
164
def fetch_papers():
    """Fetch the default daily-papers listing and parse each item into an ``Article``."""
    payload = make_request(API_URL)
    parsed = []
    for entry in payload:
        parsed.append(parse_article(entry))
    return parsed
167
+
168
+
169
def fetch_papers_with_date(date: datetime):
    """Fetch and parse the listing for a single day.

    Args:
        date: The day to query; only its year, month and day are used.

    Returns:
        A list of ``Article`` objects, one per item returned for that day.
    """
    day = date.strftime("%Y-%m-%d")
    payload = make_request(f"{API_URL}?date={day}")
    parsed = []
    for entry in payload:
        parsed.append(parse_article(entry))
    return parsed
173
+
174
+
175
def fetch_papers_with_daterange(start_date: datetime, end_date: datetime):
    """Fetch the listings for every day in ``[start_date, end_date]`` and de-duplicate them.

    Each day is queried independently, so the range is simply walked one day
    at a time. Articles are de-duplicated by ``paper.id``, keeping the first
    occurrence; articles without a paper record are dropped because they have
    no id to key on.

    Args:
        start_date: First day to query (inclusive).
        end_date: Last day to query (inclusive).

    Returns:
        A list of unique ``Article`` objects in first-seen order; empty when
        ``start_date`` is after ``end_date``.
    """
    articles = []
    current_date = start_date
    # `datetime` names the class in this file (from datetime import datetime),
    # so the step must come from the directly imported `timedelta`.
    one_day = timedelta(days=1)
    while current_date <= end_date:
        print(current_date)
        articles.extend(fetch_papers_with_date(current_date))
        print(f"Total articles: {len(articles)}")
        current_date += one_day

    # De-duplicate on paper.id, first occurrence wins.
    unique_articles = {}
    for article in articles:
        if article.paper is None:
            continue
        unique_articles.setdefault(article.paper.id, article)

    return list(unique_articles.values())
193
+
194
+
195
def sort_by_date(articles):
    """Return *articles* sorted newest first by ``publishedAt``.

    Articles whose ``publishedAt`` is ``None`` are placed last instead of
    raising ``TypeError`` during comparison.
    """
    def date_key(article):
        published = article.publishedAt
        # The leading flag decides every dated-vs-undated comparison, so None
        # is never compared against a datetime.
        return (published is not None, published)

    return sorted(articles, key=date_key, reverse=True)
197
+
198
+
199
def sort_by_upvotes(articles):
    """Return *articles* sorted by ``paper.upvotes``, highest first.

    A missing paper or a ``None`` upvote count is treated as 0 (the same
    fallback the rendering code uses) instead of raising.
    """
    def upvote_count(article):
        paper = article.paper
        return (paper.upvotes if paper is not None else None) or 0

    return sorted(articles, key=upvote_count, reverse=True)
201
+
202
+
203
def sort_by_comments(articles):
    """Return *articles* sorted by ``numComments``, highest first.

    A ``None`` comment count is treated as 0 instead of raising ``TypeError``.
    """
    return sorted(articles, key=lambda article: article.numComments or 0, reverse=True)
205
+
206
+
207
def format_author(author):
    """Render one author as an HTML fragment.

    A named author becomes a link to a Google Scholar author search; an
    unnamed one becomes a placeholder label. A marker is appended when the
    author's ``hidden`` flag is truthy.

    Args:
        author: An object with ``name`` and ``hidden`` attributes, or a
            falsy value.

    Returns:
        The HTML fragment, or ``""`` when *author* is falsy.
    """
    if not author:
        return ""
    hidden_status = "(隐藏)" if author.hidden else ""
    if not author.name:
        return f"匿名作者{hidden_status}"
    # The name comes from an external API: URL-encode it for the query string
    # and HTML-escape it for the link text so it cannot break the markup.
    query = quote_plus(author.name)
    label = html.escape(author.name)
    return f"<a href='https://scholar.google.com/citations?view_op=search_authors&mauthors={query}'>{label}</a>{hidden_status}"
215
+
216
+
217
def format_paper_info(article):
    """Render the detail view of one article as an HTML string.

    All text taken from the API payload (title, ids, summary, URLs, submitter
    names) is HTML-escaped before being placed in the markup.

    Args:
        article: An ``Article``-like object.

    Returns:
        The HTML for the detail panel, or a short placeholder message when
        the article has no paper record.
    """
    paper = article.paper
    if not paper:
        return "论文信息缺失"

    esc = html.escape  # also escapes quotes, so values are safe inside attributes
    info = []

    # Title
    info.append(f"<h2>{esc(article.title or '无标题论文')}</h2>")

    # Thumbnail
    if article.thumbnail:
        info.append(f"<p><img src='{esc(article.thumbnail)}' style='max-width: 30em; width: 100%; margin: auto'/></p>")

    # Basic information; only link to the paper page when an id exists
    # (previously a missing id produced a link to ".../papers/None").
    paper_id = esc(str(paper.id)) if paper.id else None
    if paper_id:
        id_markup = f"<a href='https://huggingface.co/papers/{paper_id}'>{paper_id}</a>"
    else:
        id_markup = "未知"
    info.append(f"<p><strong>论文 ID</strong>:{id_markup}</p>")
    published = paper.publishedAt.strftime('%Y-%m-%d %H:%M') if paper.publishedAt else '未知'
    info.append(f"<p><strong>发布时间</strong>:{published}</p>")

    # Authors (format_author returns ready-made HTML)
    authors = "、".join(format_author(a) for a in paper.authors) if paper.authors else "作者信息暂缺"
    info.append(f"<p><strong>作者</strong>:{authors}</p>")

    # Summary: collapse doubled braces and newlines, then escape
    if paper.summary:
        summary = paper.summary.replace('{{', '{').replace('}}', '}').replace('\n', ' ')
        info.append(f"<h3>摘要</h3><p>{esc(summary)}</p>")

    # Votes / comments
    info.append(f"<p><strong>点赞数</strong>:{paper.upvotes or 0}<span style='margin-left: .5rem'></span>")
    info.append(f"<strong>评论数</strong>:{article.numComments or 0}</p>")
    # Discussion link gets its own paragraph; the old markup emitted an
    # unmatched </p> here.
    if paper.discussionId and paper_id:
        discussion_id = esc(str(paper.discussionId))
        info.append(f"<p><a href='https://huggingface.co/papers/{paper_id}/discussion/{discussion_id}'>进入讨论</a></p>")

    # Submitter
    submitter = article.submittedBy
    if submitter:
        handle = esc(str(submitter.name))
        info.append("<hr><p><strong>提交者</strong>: ")
        info.append(
            f"<span><img src='{esc(str(submitter.avatarUrl))}' class='author' /></span>{esc(str(submitter.fullname))}(<a href='https://huggingface.co/{handle}'>@{handle}</a>) ")
        info.append(f"粉丝数:{submitter.followerCount or 0}</p>")

    return "".join(info)
258
+
259
+
260
def generate_table_html(papers):
    """Render the result table, one row per article, with clickable titles.

    Clicking a title calls the page-level ``showDetail(<paper id>)`` script.
    Titles and ids are HTML-escaped. An article without a paper record (or
    without a paper id) is rendered with a plain, non-clickable title instead
    of raising.

    Args:
        papers: Iterable of ``Article``-like objects.

    Returns:
        The complete ``<table>`` markup as a string.
    """
    rows = ['<table class="paper-table"><tr><th>标题</th><th>👍点赞</th><th>💬评论</th><th>📅日期</th></tr>']

    for article in papers:
        paper = article.paper
        title = html.escape(article.title or "无标题")
        upvotes = (paper.upvotes if paper else None) or 0
        comments = article.numComments or 0
        date = paper.publishedAt.strftime("%Y-%m-%d") if paper and paper.publishedAt else "未知"

        if paper and paper.id:
            paper_id = html.escape(str(paper.id))
            title_cell = f"""<a class="paper-title" href="javascript:void(0)" onclick="showDetail('{paper_id}')">{title}</a>"""
        else:
            # No id means there is no detail block to reveal.
            title_cell = title

        rows.append(f"""
        <tr>
            <td>{title_cell}</td>
            <td>{upvotes}</td>
            <td>{comments}</td>
            <td>{date}</td>
        </tr>
        """)

    rows.append("</table>")
    return "".join(rows)
283
+
284
+
285
def build_html(papers):
    """Render every article's detail view into one HTML string of hidden blocks.

    Each article is wrapped in a ``<div>`` that starts hidden and whose id is
    ``smartflow-paper-<paper id with every '.' replaced by '-'>``. Articles
    without a paper record or without a paper id are skipped, since no id can
    be derived for them (previously they raised ``AttributeError``).

    Args:
        papers: Iterable of ``Article``-like objects.

    Returns:
        The concatenated markup; ``""`` when nothing can be rendered.
    """
    sections = []
    for article in papers:
        paper = article.paper
        if paper is None or not paper.id:
            continue
        dom_id = str(paper.id).replace('.', '-')
        article_html = format_paper_info(article)
        sections.append(f"<div id='smartflow-paper-{dom_id}' style='display: none'>{article_html}</div>")
    return "".join(sections)
292
+
293
+
294
def query_papers(start_date_str, end_date_str):
    """Handle a date-range query from the UI.

    Args:
        start_date_str: First day of the range, formatted ``YYYY-MM-DD``.
        end_date_str: Last day of the range, formatted ``YYYY-MM-DD``.

    Returns:
        A ``(table_html, detail_html)`` pair. On any exception the error is
        printed and both elements are the same failure message.
    """
    date_format = "%Y-%m-%d"
    try:
        range_start = datetime.strptime(start_date_str, date_format)
        range_end = datetime.strptime(end_date_str, date_format)
        ranked = sort_by_upvotes(fetch_papers_with_daterange(range_start, range_end))
        table_markup = generate_table_html(ranked)
        detail_markup = build_html(ranked)
    except Exception as e:
        print(f"查询出错: {e}")
        failure = "<p>⚠️ 查询失败,请检查日期格式(YYYY-MM-DD)</p>"
        return failure, failure
    return table_markup, detail_markup
305
+
306
+
307
def show_detail(paper_id, papers):
    """Return the detail markup for *papers*, or a prompt when there are none.

    Args:
        paper_id: Accepted but not used by this function.
        papers: The articles from the last query.

    Returns:
        The output of ``build_html(papers)``, or a short prompt string when
        *papers* is empty or ``None``.
    """
    return build_html(papers) if papers else "请先进行查询"
313
+
314
+
315
# CSS for the results table and the detail panel (could be moved to a separate file).
# Passed to gr.Blocks(css=...) in create_interface.
custom_css = """
.paper-table { width: 100%; border-collapse: collapse; }
.paper-table td { padding: 12px; border-bottom: 1px solid #ddd; }
.paper-table th { font-weight: bold; background: #f9f9f920; }
.paper-table tr:hover { background: #f9f9f920; }
.paper-title { color: #1a73e8; cursor: pointer; text-decoration: none !important; }
.paper-title:hover { text-decoration: underline !important; }
.paper-table td:nth-child(2), .paper-table td:nth-child(3), .paper-table td:nth-child(4) { text-align: center; }
.paper-table th:nth-child(2), .paper-table th:nth-child(3), .paper-table th:nth-child(4) { text-align: center; }
.detail-area { margin-top: 20px; padding: 20px; border: 1px solid #ddd; border-radius: 5px; }
"""
327
+
328
# Page-level script injected into <head> by create_interface. showDetail() is
# called from the title links produced by generate_table_html.
custom_js = """
function showDetail(paperId) {
    // Element ids are generated with EVERY '.' replaced by '-', so do the same
    // here (String.replace with a string pattern only replaces the first match).
    var targetId = 'smartflow-paper-' + String(paperId).split('.').join('-');

    // Show the requested detail block and hide all the others
    document.querySelectorAll("div[id^='smartflow-paper-']").forEach(function (node) {
        node.style.display = (node.id === targetId) ? 'block' : 'none';
    });
}
"""
341
+
342
+
343
def create_interface():
    """Build the Gradio Blocks app: date-range query form, results table, detail panel.

    Returns:
        The constructed ``gr.Blocks`` application (not yet launched).
    """
    # NOTE(review): nesting below is reconstructed from the statement order —
    # confirm which widgets belong inside each Row/Column.
    with gr.Blocks(title="Hugging Face Daily Paper", css=custom_css, head=f"<script>{custom_js}</script>") as app:

        # Main heading
        gr.Markdown("# 📚 Hugging Face Daily Paper")

        # Query controls; both date boxes are pre-filled with today's date
        with gr.Row():
            start_date = gr.Textbox(label="起始日期", placeholder="YYYY-MM-DD", value=datetime.now().strftime("%Y-%m-%d"))
            end_date = gr.Textbox(label="结束日期", placeholder="YYYY-MM-DD", value=datetime.now().strftime("%Y-%m-%d"))
            query_btn = gr.Button("🔍 查询", variant="primary")

        # Results area (receives the table markup)
        with gr.Column(visible=True):
            results_html = gr.HTML(label="查询结果")

        # Paper detail area (receives the hidden per-paper blocks)
        with gr.Column(visible=True, elem_classes="detail-area"):
            gr.Markdown("## 论文详情")
            detail_html = gr.HTML(elem_id="detail-html")

        # Event wiring: a click runs query_papers and fills both HTML components
        query_btn.click(
            fn=query_papers,
            inputs=[start_date, end_date],
            outputs=[results_html, detail_html]
        )

    return app
373
+
374
+
375
# Script entry point: close any Gradio apps already running in this process,
# then build and launch the interface with Gradio's default launch settings.
if __name__ == "__main__":
    gr.close_all()
    app = create_interface()
    app.launch(
        # server_name="localhost",
        # server_port=7860,
        # share=True
    )