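"""Fetch and aggregate Hugging Face daily papers.

Wraps the https://huggingface.co/api/daily_papers endpoint with a simple
in-memory cache, optional proxy support, retry logic, and concurrent
fetching across a date range, then de-duplicates the results by paper id.
"""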
import datetime
import hashlib
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List

import requests
from rich import print

from ai.classify_paper import classify_papers
from date import Date
from parser import parse_article, Article
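# NOTE: `date.Date` is a project-local helper, not the standard library. This
# module assumes it supports a no-argument constructor for today's date, a
# (year, month, day) constructor, `<=`/`==` comparison, day-wise increment via
# `+= 1`, and `str()` formatting in the form the API expects (presumably
# YYYY-MM-DD).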
API_URL = "https://huggingface.co/api/daily_papers"
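# Module-level in-memory cache: responses keyed by an MD5 hash of the URL,
# with per-entry timestamps used to enforce a one-hour expiry in make_request.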
cache = {}
cache_expiry = {}
def make_request(url: str):
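    """GET `url` and return its decoded JSON body.

    Responses are served from the in-memory cache for up to an hour. On a
    miss, the request is retried up to three times (honoring HF_HTTP_PROXY /
    HF_HTTPS_PROXY if set); after three failures an empty list is returned.
    """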
# Create a hash of the URL to use as the cache key
url_hash = hashlib.md5(url.encode()).hexdigest()
current_time = datetime.datetime.now()
    # Serve from cache if present and less than an hour old; total_seconds()
    # (rather than .seconds) ensures entries older than a day also expire.
    if url_hash in cache and (current_time - cache_expiry[url_hash]).total_seconds() < 3600:
print(f"Cache hit for URL: {url}")
return cache[url_hash]
http_proxy = os.getenv("HF_HTTP_PROXY")
https_proxy = os.getenv("HF_HTTPS_PROXY")
proxies = {
"http": http_proxy,
"https": https_proxy
} if http_proxy or https_proxy else None
attempts = 0
while attempts < 3:
try:
            # requests has no default timeout; 30 s is an arbitrary guard
            # against a request hanging indefinitely.
            response = requests.get(url, proxies=proxies, timeout=30)
response.raise_for_status()
data = response.json()
# Cache the response and set the expiry time
cache[url_hash] = data
cache_expiry[url_hash] = current_time
return data
except requests.RequestException as e:
attempts += 1
print(f"Attempt {attempts} failed: {e}")
            if attempts == 3:
                # All retries exhausted; fall back to an empty result so
                # callers can still iterate over the return value.
                return []
def fetch_papers():
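    """Fetch today's daily papers and parse each entry into an Article."""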
data = make_request(API_URL)
return [parse_article(item) for item in data]
def fetch_papers_with_date(date: Date):
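    """Fetch the daily papers for a specific date via the `?date=` query parameter."""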
formatted_date = str(date)
    data = make_request(f"{API_URL}?date={formatted_date}")
return [parse_article(item) for item in data]
def fetch_papers_with_daterange(start_date: Date, end_date: Date):
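    """Fetch papers for every day in [start_date, end_date] concurrently,
    then de-duplicate the combined results by paper id."""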
articles: List[Article] = []
current_date = start_date
dates = []
while current_date <= end_date:
dates.append(current_date)
current_date += 1
    def fetch_for_date(date):
        # Date() with no arguments is assumed to construct today's date.
        if date == Date():
            print("Fetching papers for today")
            return fetch_papers()
else:
print(f"Fetching papers for {date}")
return fetch_papers_with_date(date)
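    # Fan the per-date fetches out across a small thread pool; the work is
    # I/O-bound, so 8 workers is a reasonable (if arbitrary) degree of
    # parallelism.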
with ThreadPoolExecutor(max_workers=8) as executor:
future_to_date = {executor.submit(fetch_for_date, date): date for date in dates}
for future in as_completed(future_to_date):
date = future_to_date[future]
try:
articles.extend(future.result())
except Exception as e:
print(f"Error fetching articles for date {date}: {e}")
# articles = [article for article in articles if (start_date <= Date(article.publishedAt.isoformat().split('T')[0]) <= end_date)]
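    # De-duplicate by paper id: the same paper can appear in more than one
    # day's listing, and the first occurrence wins.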
unique_articles: Dict[str, Article] = {}
for article in articles:
if article.paper.id not in unique_articles:
unique_articles[article.paper.id] = article
print(f"Unique articles: {len(unique_articles)}")
    # Payload for the (currently disabled) classification step below.
    preprocessed_articles: List[Dict[str, str]] = [
        {
            "title": article.title,
            "abstract": article.paper.summary,
            "id": article.paper.id,
        }
        for article in unique_articles.values()
    ]
    # classified_articles = classify_papers(preprocessed_articles)
    # Iterate over classified_articles and write each classification result
    # back into the matching entry in unique_articles:
    # for article in classified_articles:
    #     unique_articles[article["id"]].paper.label = article["category"]
return list(unique_articles.values())
if __name__ == "__main__":
start_date = Date(2025, 1, 21)
end_date = Date(2025, 2, 1)
articles = fetch_papers_with_daterange(start_date=start_date, end_date=end_date)
# print(articles)
print(f"Total articles: {len(articles)}")