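"""Fetch Hugging Face daily papers over a date range, with simple in-memory request caching, retries, and de-duplication by paper id."""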
from parser import parse_article, Article
from ai.classify_paper import classify_papers
import os
import requests
import datetime
import hashlib
import json
from rich import print
from date import Date
from concurrent.futures import ThreadPoolExecutor, as_completed

from typing import List, Dict


API_URL = "https://huggingface.co/api/daily_papers"

cache = {}
cache_expiry = {}


def make_request(url: str):
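    """Fetch JSON from `url`, using a 1-hour in-memory cache and up to 3 retry attempts; returns [] if all attempts fail."""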
    # Create a hash of the URL to use as the cache key
    url_hash = hashlib.md5(url.encode()).hexdigest()
    current_time = datetime.datetime.now()

    # Check if the response is already cached and not expired
    # total_seconds() is required here: timedelta.seconds only covers the sub-day component,
    # so an entry older than a day would otherwise still look fresh
    if url_hash in cache and (current_time - cache_expiry[url_hash]).total_seconds() < 3600:
        print(f"Cache hit for URL: {url}")
        return cache[url_hash]

    http_proxy = os.getenv("HF_HTTP_PROXY")
    https_proxy = os.getenv("HF_HTTPS_PROXY")
    proxies = {
        "http": http_proxy,
        "https": https_proxy
    } if http_proxy or https_proxy else None

    attempts = 0
    while attempts < 3:
        try:
            # A timeout keeps a stalled connection from hanging a worker thread indefinitely
            response = requests.get(url, proxies=proxies, timeout=30)
            response.raise_for_status()
            data = response.json()

            # Cache the response and set the expiry time
            cache[url_hash] = data
            cache_expiry[url_hash] = current_time

            return data
        except requests.RequestException as e:
            attempts += 1
            print(f"Attempt {attempts} failed: {e}")
            if attempts == 3:
                return []


def fetch_papers():
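    """Fetch today's daily papers and parse each entry into an Article."""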
    data = make_request(API_URL)
    return [parse_article(item) for item in data]


def fetch_papers_with_date(date: Date):
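    """Fetch daily papers for a specific date; `str(date)` is assumed to yield the date format the API expects."""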
    formatted_date = str(date)
    data = make_request(API_URL + "?date=" + formatted_date)
    return [parse_article(item) for item in data]


def fetch_papers_with_daterange(start_date: Date, end_date: Date):
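    """Fetch papers for every date from start_date to end_date (inclusive) in parallel and de-duplicate them by paper id."""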
    articles: List[Article] = []
    current_date = start_date
    dates = []

    while current_date <= end_date:
        dates.append(current_date)
        current_date += 1

    def fetch_for_date(date):
        print(date)
        if date == Date():
            print("Fetching papers for today")
            return fetch_papers()
        else:
            print(f"Fetching papers for {date}")
            return fetch_papers_with_date(date)

    with ThreadPoolExecutor(max_workers=8) as executor:
        future_to_date = {executor.submit(fetch_for_date, date): date for date in dates}
        for future in as_completed(future_to_date):
            date = future_to_date[future]
            try:
                articles.extend(future.result())
            except Exception as e:
                print(f"Error fetching articles for date {date}: {e}")

    # articles = [article for article in articles if (start_date <= Date(article.publishedAt.isoformat().split('T')[0]) <= end_date)]

    unique_articles: Dict[str, Article] = {}
    for article in articles:
        if article.paper.id not in unique_articles:
            unique_articles[article.paper.id] = article

    print(f"Unique articles: {len(unique_articles)}")

    preprocessed_articles: List[Article] = list(unique_articles.values())

    # Reduce each article to the fields the classifier needs; this is only consumed
    # by the classify_papers call below, which is currently disabled.
    preprocessed_articles = [{
        "title": article.title,
        "abstract": article.paper.summary,
        "id": article.paper.id
    } for article in preprocessed_articles]

    # classified_articles = classify_papers(preprocessed_articles)

    # Iterate over classified_articles and write each classification result back into unique_articles
    # for article in classified_articles:
    #     unique_articles[article["id"]].paper.label = article["category"]

    return list(unique_articles.values())


if __name__ == "__main__":
    from rich import print
    start_date = Date(2025, 1, 21)
    end_date = Date(2025, 2, 1)
    articles = fetch_papers_with_daterange(start_date=start_date, end_date=end_date)
    # print(articles)
    print(f"Total articles: {len(articles)}")