Spaces:
No application file
No application file
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,382 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import hashlib
|
3 |
+
import datetime
|
4 |
+
import requests
|
5 |
+
import os
|
6 |
+
import gradio as gr
|
7 |
+
from datetime import datetime, timedelta
|
8 |
+
|
9 |
+
from dataclasses import dataclass
|
10 |
+
from datetime import datetime
|
11 |
+
from typing import List, Optional, Any, Dict
|
12 |
+
|
13 |
+
# Revised data classes (every field is Optional and has a default value)
|
14 |
+
|
15 |
+
|
16 |
+
@dataclass
class Author:
    """One author entry of a paper; every field is optional and defaults to ``None``."""

    # Author identifier.
    _id: Optional[str] = None
    # Display name of the author.
    name: Optional[str] = None
    # Whether the author is flagged as hidden.
    hidden: Optional[bool] = None
|
21 |
+
|
22 |
+
|
23 |
+
@dataclass
class Paper:
    """Metadata of a single paper; every field is optional and defaults to ``None``."""

    # Paper identifier (used to build huggingface.co/papers/<id> links).
    id: Optional[str] = None
    # Parsed author entries. Annotated Optional because the default is None,
    # not an empty list.
    authors: Optional[List[Author]] = None
    # Publication timestamp of the paper.
    publishedAt: Optional[datetime] = None
    # Paper title.
    title: Optional[str] = None
    # Abstract text.
    summary: Optional[str] = None
    # Number of upvotes.
    upvotes: Optional[int] = None
    # Identifier of the discussion thread attached to the paper.
    discussionId: Optional[str] = None
|
32 |
+
|
33 |
+
|
34 |
+
@dataclass
class SubmittedBy:
    """Account that submitted an article; every field is optional and defaults to ``None``."""

    # Account identifier.
    _id: Optional[str] = None
    # URL of the account's avatar image.
    avatarUrl: Optional[str] = None
    # Full display name.
    fullname: Optional[str] = None
    # Account handle (used to build huggingface.co/<name> links).
    name: Optional[str] = None
    # Account type as reported by the API.
    type: Optional[str] = None
    # Account flags as reported by the API.
    isPro: Optional[bool] = None
    isHf: Optional[bool] = None
    isMod: Optional[bool] = None
    # Number of followers.
    followerCount: Optional[int] = None
|
45 |
+
|
46 |
+
|
47 |
+
@dataclass
class Article:
    """One daily-papers entry: a paper plus its listing metadata.

    Every field is optional and defaults to ``None``.
    """

    # The paper this entry refers to.
    paper: Optional[Paper] = None
    # Timestamp of the listing entry itself.
    publishedAt: Optional[datetime] = None
    # Title shown for the entry.
    title: Optional[str] = None
    # URL of the thumbnail image.
    thumbnail: Optional[str] = None
    # Number of comments on the entry.
    numComments: Optional[int] = None
    # Account that submitted the entry.
    submittedBy: Optional[SubmittedBy] = None
    # Flag reported by the API under "isAuthorParticipating".
    isAuthorParticipating: Optional[bool] = None
|
56 |
+
|
57 |
+
|
58 |
+
def safe_get(data: Dict, *keys: str) -> Any:
    """Walk nested dictionaries along ``keys`` and return the value found.

    Returns ``None`` when a key is missing, when an intermediate value is not
    a dict, or when the final value is an empty dict. Other falsy values
    (``0``, ``""``, ``[]``) are returned unchanged.
    """
    current = data
    for name in keys:
        if isinstance(current, dict):
            # A missing key yields {}, which is mapped to None at the end.
            current = current.get(name, {})
        else:
            current = None
    if current == {}:
        return None
    return current
|
63 |
+
|
64 |
+
|
65 |
+
def parse_article(data: Dict[str, Any]) -> Article:
    """Build an ``Article`` from one raw API item, tolerating missing fields.

    Missing keys become ``None``. ``paper`` and ``submittedBy`` are only
    constructed when the corresponding section of ``data`` is present and
    non-empty.
    """

    def parse_datetime(dt_str: Optional[str]) -> Optional[datetime]:
        """Parse an ISO-format timestamp; return ``None`` when empty or invalid."""
        if not dt_str:
            return None
        try:
            # Rewrite a trailing "Z" as an explicit "+00:00" offset before parsing.
            normalized = dt_str[:-1] + '+00:00' if dt_str.endswith('Z') else dt_str
            return datetime.fromisoformat(normalized)
        except ValueError:
            return None

    # Author list (empty when the paper section or its authors are missing).
    authors = [
        Author(
            _id=entry.get("_id"),
            name=entry.get("name"),
            hidden=entry.get("hidden"),
        )
        for entry in safe_get(data, "paper", "authors") or []
    ]

    # Paper section.
    paper = None
    if safe_get(data, "paper"):
        paper = Paper(
            id=safe_get(data, "paper", "id"),
            authors=authors,
            publishedAt=parse_datetime(safe_get(data, "paper", "publishedAt")),
            title=safe_get(data, "paper", "title"),
            summary=safe_get(data, "paper", "summary"),
            upvotes=safe_get(data, "paper", "upvotes"),
            discussionId=safe_get(data, "paper", "discussionId"),
        )

    # Submitter section.
    submitter_data = safe_get(data, "submittedBy")
    submitted_by = None
    if submitter_data:
        submitted_by = SubmittedBy(
            _id=submitter_data.get("_id"),
            avatarUrl=submitter_data.get("avatarUrl"),
            fullname=submitter_data.get("fullname"),
            name=submitter_data.get("name"),
            type=submitter_data.get("type"),
            isPro=submitter_data.get("isPro"),
            isHf=submitter_data.get("isHf"),
            isMod=submitter_data.get("isMod"),
            followerCount=submitter_data.get("followerCount"),
        )

    # Assemble the final object from the top-level fields.
    return Article(
        paper=paper,
        publishedAt=parse_datetime(data.get("publishedAt")),
        title=data.get("title"),
        thumbnail=data.get("thumbnail"),
        numComments=data.get("numComments"),
        submittedBy=submitted_by,
        isAuthorParticipating=data.get("isAuthorParticipating"),
    )
|
123 |
+
|
124 |
+
|
125 |
+
# Endpoint queried for the daily papers listing.
API_URL = "https://huggingface.co/api/daily_papers"

# In-memory response cache used by make_request, keyed by the MD5 hex digest
# of the request URL.
cache = {}
|
128 |
+
|
129 |
+
|
130 |
+
def make_request(url: str, timeout: float = 30.0):
    """GET ``url`` and return its decoded JSON body, using the module cache.

    Successful responses are stored in ``cache`` under the MD5 hex digest of
    the URL and returned from there on later calls. Proxies are taken from the
    ``HF_HTTP_PROXY`` / ``HF_HTTPS_PROXY`` environment variables when either
    is set.

    Args:
        url: The URL to fetch.
        timeout: Seconds to wait for the server before an attempt is counted
            as failed. Without a timeout a stalled connection would block the
            caller indefinitely.

    Returns:
        The decoded JSON payload, or ``[]`` after three failed attempts.
    """
    # MD5 is used only to derive a compact cache key, not for security.
    url_hash = hashlib.md5(url.encode()).hexdigest()

    if url_hash in cache:
        print(f"Cache hit for URL: {url}")
        return cache[url_hash]

    http_proxy = os.getenv("HF_HTTP_PROXY")
    https_proxy = os.getenv("HF_HTTPS_PROXY")
    proxies = None
    if http_proxy or https_proxy:
        proxies = {"http": http_proxy, "https": https_proxy}

    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, proxies=proxies, timeout=timeout)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON on requests
            # versions whose JSON error is not a RequestException.
            print(f"Attempt {attempt} failed: {e}")
        else:
            cache[url_hash] = data
            return data

    # Every attempt failed: callers iterate the result, so return an empty list.
    return []
|
162 |
+
|
163 |
+
|
164 |
+
def fetch_papers():
    """Fetch the listing at ``API_URL`` and parse every item into an ``Article``."""
    raw_items = make_request(API_URL)
    return list(map(parse_article, raw_items))
|
167 |
+
|
168 |
+
|
169 |
+
def fetch_papers_with_date(date: datetime):
    """Fetch the listing for one day and parse every item into an ``Article``.

    The day is sent as a ``date=YYYY-MM-DD`` query parameter.
    """
    day = date.strftime("%Y-%m-%d")
    raw_items = make_request(f"{API_URL}?date={day}")
    return [parse_article(raw) for raw in raw_items]
|
173 |
+
|
174 |
+
|
175 |
+
def fetch_papers_with_daterange(start_date: datetime, end_date: datetime):
    """Fetch the articles of every day in ``[start_date, end_date]``, de-duplicated.

    Each day is fetched independently with ``fetch_papers_with_date``. When the
    same paper id appears on several days, the first occurrence is kept.
    Articles without a ``paper`` have no id to de-duplicate on and are dropped.

    Returns:
        A list of unique articles in first-seen order; empty when
        ``start_date`` is after ``end_date``.
    """
    articles = []
    current_date = start_date
    while current_date <= end_date:
        print(current_date)
        articles.extend(fetch_papers_with_date(current_date))
        print(f"Total articles: {len(articles)}")
        # `datetime` is the class here (from datetime import datetime), so the
        # step must use the imported `timedelta`, not `datetime.timedelta`.
        current_date += timedelta(days=1)

    # De-duplicate on paper id, keeping the first article seen for each id.
    unique_articles = {}
    for article in articles:
        if article.paper is None:
            continue
        unique_articles.setdefault(article.paper.id, article)

    return list(unique_articles.values())
|
193 |
+
|
194 |
+
|
195 |
+
def sort_by_date(articles):
    """Return ``articles`` sorted by ``publishedAt``, newest first.

    Articles whose ``publishedAt`` is ``None`` cannot be compared with dated
    ones, so they are placed after all dated articles in their original order.
    """
    dated = [a for a in articles if a.publishedAt is not None]
    undated = [a for a in articles if a.publishedAt is None]
    return sorted(dated, key=lambda a: a.publishedAt, reverse=True) + undated
|
197 |
+
|
198 |
+
|
199 |
+
def sort_by_upvotes(articles):
    """Return ``articles`` sorted by paper upvotes, highest first.

    A missing paper or a ``None`` upvote count is treated as 0 so that
    incomplete records do not break the sort. Ties keep their original order.
    """

    def upvote_count(article):
        paper = article.paper
        if paper is None:
            return 0
        return paper.upvotes or 0

    return sorted(articles, key=upvote_count, reverse=True)
|
201 |
+
|
202 |
+
|
203 |
+
def sort_by_comments(articles):
    """Return ``articles`` sorted by ``numComments``, highest first.

    A ``None`` comment count is treated as 0 so that incomplete records do
    not break the sort. Ties keep their original order.
    """
    return sorted(articles, key=lambda a: a.numComments or 0, reverse=True)
|
205 |
+
|
206 |
+
|
207 |
+
def format_author(author):
    """Render one author as HTML.

    A named author becomes a link to a Google Scholar author search for that
    name; an unnamed one becomes a fixed placeholder. A hidden author gets a
    "(hidden)" marker appended. Returns ``""`` for a falsy ``author``.

    The name comes from the remote API, so it is HTML-escaped in the link text
    and URL-encoded in the query string instead of being inserted verbatim.
    """
    # Local imports: these modules are not among the file's top-level imports.
    from html import escape
    from urllib.parse import quote_plus

    if not author:
        return ""
    hidden_status = "(隐藏)" if author.hidden else ""
    if not author.name:
        return f"匿名作者{hidden_status}"

    name = str(author.name)
    # quote_plus turns spaces into '+' and percent-encodes everything that
    # could terminate the single-quoted href attribute.
    search_url = f"https://scholar.google.com/citations?view_op=search_authors&mauthors={quote_plus(name)}"
    return f"<a href='{search_url}'>{escape(name)}</a>{hidden_status}"
|
215 |
+
|
216 |
+
|
217 |
+
def format_paper_info(article):
    """Render the detail view of one article as an HTML fragment.

    Returns a fixed placeholder string when the article has no paper.
    Otherwise the fragment contains, in order: title, optional thumbnail,
    paper id link, publication time, authors, optional abstract, vote and
    comment counts, optional discussion link, and optional submitter info.

    All values taken from the article are HTML-escaped, since they originate
    from the remote API. Author markup is produced by ``format_author``.
    """
    # Local import: `html` is not among the file's top-level imports.
    from html import escape

    def text(value):
        """Escape any value for use in HTML text or a quoted attribute."""
        return escape(str(value))

    paper = article.paper
    if not paper:
        return "论文信息缺失"

    info = []

    # Title
    info.append(f"<h2>{text(article.title or '无标题论文')}</h2>")

    # Thumbnail
    if article.thumbnail:
        info.append(f"<p><img src='{text(article.thumbnail)}' style='max-width: 30em; width: 100%; margin: auto'/></p>")

    # Paper id: only link when an id exists, otherwise show the placeholder
    # as plain text instead of linking to ".../papers/None".
    paper_id = text(paper.id) if paper.id else None
    if paper_id:
        id_html = f"<a href='https://huggingface.co/papers/{paper_id}'>{paper_id}</a>"
    else:
        id_html = "未知"
    info.append(f"<p><strong>论文 ID</strong>:{id_html}</p>")

    # Publication time
    published = paper.publishedAt.strftime('%Y-%m-%d %H:%M') if paper.publishedAt else '未知'
    info.append(f"<p><strong>发布时间</strong>:{published}</p>")

    # Authors
    authors = "、".join(format_author(a) for a in paper.authors) if paper.authors else "作者信息暂缺"
    info.append(f"<p><strong>作者</strong>:{authors}</p>")

    # Abstract: collapse doubled braces and newlines, then escape.
    if paper.summary:
        summary = paper.summary.replace('{{', '{').replace('}}', '}').replace('\n', ' ')
        info.append(f"<h3>摘要</h3><p>{text(summary)}</p>")

    # Vote and comment counts share one paragraph.
    info.append(f"<p><strong>点赞数</strong>:{text(paper.upvotes or 0)}<span style='margin-left: .5rem'></span>")
    info.append(f"<strong>评论数</strong>:{text(article.numComments or 0)}</p>")

    # Discussion link, in its own paragraph so the tags stay balanced.
    if paper.discussionId and paper_id:
        info.append(f"<p><a href='https://huggingface.co/papers/{paper_id}/discussion/{text(paper.discussionId)}'>进入讨论</a></p>")

    # Submitter
    submitter = article.submittedBy
    if submitter:
        handle = text(submitter.name)
        info.append("<hr><p><strong>提交者</strong>: ")
        info.append(
            f"<span><img src='{text(submitter.avatarUrl)}' class='author' /></span>{text(submitter.fullname)}(<a href='https://huggingface.co/{handle}'>@{handle}</a>) ")
        info.append(f"粉丝数:{text(submitter.followerCount or 0)}</p>")

    return "".join(info)
|
258 |
+
|
259 |
+
|
260 |
+
def generate_table_html(papers):
    """Render ``papers`` as an HTML table with clickable titles.

    Each row shows title, upvotes, comment count and publication date.
    Clicking a title calls the page's ``showDetail`` JavaScript function with
    the paper id. Articles without a paper are skipped, because they have no
    id to pass to ``showDetail``.

    The title is HTML-escaped. The paper id is encoded as a JSON string
    literal and then HTML-escaped, so it is safe inside the ``onclick``
    attribute. Whitespace between table tags is not significant, so rows are
    emitted without it.
    """
    # Local import: `html` is not among the file's top-level imports.
    from html import escape

    rows = ['<table class="paper-table"><tr><th>标题</th><th>👍点赞</th><th>💬评论</th><th>📅日期</th></tr>']

    for article in papers:
        paper = article.paper
        if paper is None:
            continue

        title = escape(str(article.title or "无标题"))
        upvotes = escape(str(paper.upvotes or 0))
        comments = escape(str(article.numComments or 0))
        date = paper.publishedAt.strftime("%Y-%m-%d") if paper.publishedAt else "未知"
        # json.dumps yields a valid JS string literal; escape() then makes it
        # safe inside the double-quoted HTML attribute.
        js_arg = escape(json.dumps(str(paper.id)))

        rows.append(
            "<tr>"
            f'<td><a class="paper-title" href="javascript:void(0)" onclick="showDetail({js_arg})">{title}</a></td>'
            f"<td>{upvotes}</td><td>{comments}</td><td>{date}</td>"
            "</tr>"
        )

    rows.append("</table>")
    return "".join(rows)
|
283 |
+
|
284 |
+
|
285 |
+
def build_html(papers):
    """Render the hidden detail blocks for ``papers`` as one HTML string.

    Each article is wrapped in a ``<div>`` that starts hidden
    (``display: none``). Its id is ``smartflow-paper-<paper id>`` with every
    ``.`` replaced by ``-``, and its content comes from ``format_paper_info``.
    Articles without a paper or paper id are skipped, because no element id
    can be derived for them.
    """
    # Local import: `html` is not among the file's top-level imports.
    from html import escape

    sections = []
    for article in papers:
        paper = article.paper
        if paper is None or paper.id is None:
            continue
        element_id = escape(f"smartflow-paper-{str(paper.id).replace('.', '-')}")
        sections.append(
            f"<div id='{element_id}' style='display: none'>{format_paper_info(article)}</div>"
        )
    # Join once instead of growing a string inside the loop.
    return "".join(sections)
|
292 |
+
|
293 |
+
|
294 |
+
def query_papers(start_date_str, end_date_str):
    """Handle a date-range query coming from the UI.

    Args:
        start_date_str: First day of the range, as ``YYYY-MM-DD``.
        end_date_str: Last day of the range, as ``YYYY-MM-DD``.

    Returns:
        A ``(table_html, detail_html)`` pair. On failure both elements hold
        the same warning message. A date-format hint is shown only when the
        dates could not be parsed; any other failure gets a generic message,
        so that unrelated errors are not reported as a date-format problem.
    """
    try:
        start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
        end_date = datetime.strptime(end_date_str, "%Y-%m-%d")
    except (TypeError, ValueError) as e:
        print(f"查询出错: {e}")
        date_error = "<p>⚠️ 查询失败,请检查日期格式(YYYY-MM-DD)</p>"
        return date_error, date_error

    try:
        papers = fetch_papers_with_daterange(start_date, end_date)
        papers = sort_by_upvotes(papers)
        return generate_table_html(papers), build_html(papers)
    except Exception as e:
        # UI boundary: log the error and show a message rather than crashing.
        print(f"查询出错: {e}")
        failure = "<p>⚠️ 查询失败,请稍后重试</p>"
        return failure, failure
|
305 |
+
|
306 |
+
|
307 |
+
def show_detail(paper_id, papers):
    """Return the detail HTML for ``papers``, or a prompt when there are none.

    ``paper_id`` is accepted but not used: the full set of detail blocks from
    ``build_html`` is returned.
    """
    if papers:
        return build_html(papers)
    return "请先进行查询"
|
313 |
+
|
314 |
+
|
315 |
+
# CSS styles (could be moved to a separate file)
|
316 |
+
# Stylesheet for the results table (.paper-table, .paper-title) and the
# detail panel (.detail-area).
custom_css = """
.paper-table { width: 100%; border-collapse: collapse; }
.paper-table td { padding: 12px; border-bottom: 1px solid #ddd; }
.paper-table th { font-weight: bold; background: #f9f9f920; }
.paper-table tr:hover { background: #f9f9f920; }
.paper-title { color: #1a73e8; cursor: pointer; text-decoration: none !important; }
.paper-title:hover { text-decoration: underline !important; }
.paper-table td:nth-child(2), .paper-table td:nth-child(3), .paper-table td:nth-child(4) { text-align: center; }
.paper-table th:nth-child(2), .paper-table th:nth-child(3), .paper-table th:nth-child(4) { text-align: center; }
.detail-area { margin-top: 20px; padding: 20px; border: 1px solid #ddd; border-radius: 5px; }
"""
|
327 |
+
|
328 |
+
# Client-side helper called by the table's title links: shows the detail block
# of one paper and hides all the others.
custom_js = """
function showDetail(paperId) {
    // Detail blocks are rendered with every '.' in the paper id replaced by
    // '-', so replace all dots here, not only the first one.
    var targetId = 'smartflow-paper-' + String(paperId).split('.').join('-');

    // Hide every other detail block.
    document.querySelectorAll("div[id^='smartflow-paper-']").forEach(function (node) {
        if (node.id !== targetId) {
            node.style.display = 'none';
        }
    });

    // Show the requested block, if it exists.
    var paper = document.getElementById(targetId);
    if (paper) {
        paper.style.display = 'block';
    }
}
"""
|
341 |
+
|
342 |
+
|
343 |
+
def create_interface():
    """Build and return the Gradio Blocks app.

    Layout: a title, a row with start/end date boxes and a query button, a
    results area, and a detail area. Clicking the button runs ``query_papers``
    on the two dates and writes its two results to the results and detail
    areas.

    The date boxes receive a callable default rather than a fixed string, so
    the default reflects the current date when the page is loaded, not the
    date on which the app was started.
    """

    def today() -> str:
        """Return today's date as ``YYYY-MM-DD``."""
        return datetime.now().strftime("%Y-%m-%d")

    with gr.Blocks(title="Hugging Face Daily Paper", css=custom_css, head=f"<script>{custom_js}</script>") as app:

        # Page title
        gr.Markdown("# 📚 Hugging Face Daily Paper")

        # Query controls
        with gr.Row():
            start_date = gr.Textbox(label="起始日期", placeholder="YYYY-MM-DD", value=today)
            end_date = gr.Textbox(label="结束日期", placeholder="YYYY-MM-DD", value=today)
            query_btn = gr.Button("🔍 查询", variant="primary")

        # Results area
        with gr.Column(visible=True):
            results_html = gr.HTML(label="查询结果")

        # Paper detail area
        with gr.Column(visible=True, elem_classes="detail-area"):
            gr.Markdown("## 论文详情")
            detail_html = gr.HTML(elem_id="detail-html")

        # Event wiring
        query_btn.click(
            fn=query_papers,
            inputs=[start_date, end_date],
            outputs=[results_html, detail_html],
        )

    return app
|
373 |
+
|
374 |
+
|
375 |
+
if __name__ == "__main__":
    # Shut down any Gradio apps that are still running, then build the
    # interface and launch it with Gradio's default launch options.
    gr.close_all()
    app = create_interface()
    app.launch()
|