File size: 4,632 Bytes
20366da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2c17be4
20366da
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import random
import feedparser
from dataclasses import dataclass
from typing import List, Optional, Dict, Set
from abc import ABC, abstractmethod

@dataclass
class Article:
    """A single article extracted from an RSS feed entry.

    All fields are stored as plain strings exactly as the parser produced
    them; no validation or date parsing is performed here.
    """

    title: str  # entry title
    link: str  # URL of the entry
    summary: str  # summary/description text of the entry
    published: str  # publication date as a raw string
    authors: str  # author names joined into one string
    pdf_link: Optional[str] = None  # direct PDF URL, when the parser found one
    source: Optional[str] = None  # label of the originating feed, when set

class BaseRSSParser(ABC):
    """Abstract interface for turning a parsed feed into Article objects."""

    @abstractmethod
    def parse_feed(self, feed: feedparser.FeedParserDict) -> List[Article]:
        """Parse an RSS feed and return a list of Article objects."""

class ArxivRSSParser(BaseRSSParser):
    """Parser for feeds whose entries carry ``authors`` and ``links`` lists."""

    def parse_feed(self, feed: feedparser.FeedParserDict) -> List[Article]:
        """Convert every entry of ``feed`` into an Article.

        Args:
            feed: A parsed feed. A feed without an ``entries`` key (for
                example one that failed to load) yields an empty list.

        Returns:
            The successfully parsed articles in feed order. An entry that
            raises while being parsed is reported via ``print`` and skipped.
        """
        articles: List[Article] = []
        # Use .get so a feed lacking 'entries' does not raise AttributeError.
        for entry in feed.get('entries', []):
            try:
                title: str = entry.get('title', 'Без названия')
                link: str = entry.get('link', '')
                summary: str = entry.get('summary', '')
                published: str = entry.get('published', 'Неизвестно')

                # Authors without a name are skipped instead of failing the
                # whole entry.
                author_names = [
                    author.get('name')
                    for author in entry.get('authors', [])
                ]
                author_names = [name for name in author_names if name]
                authors: str = ', '.join(author_names) if author_names else 'Неизвестно'

                # Links may lack a 'type'; treat those as "not a PDF".
                pdf_link: Optional[str] = next(
                    (
                        item.get('href')
                        for item in entry.get('links', [])
                        if item.get('type') == 'application/pdf'
                    ),
                    None,
                )

                articles.append(
                    Article(
                        title=title,
                        link=link,
                        summary=summary,
                        published=published,
                        authors=authors,
                        pdf_link=pdf_link,
                    )
                )
            except Exception as e:
                # Best effort: one malformed entry must not abort the feed.
                print(f"Ошибка при парсинге записи: {e}")
        return articles

# Example of a parser for a different RSS source
class DailyHFRSSParser(BaseRSSParser):
    """Parser that labels every article with the source "Daily papers"."""

    def parse_feed(self, feed: feedparser.FeedParserDict) -> List[Article]:
        """Convert every entry of ``feed`` into an Article.

        Args:
            feed: A parsed feed. A feed without an ``entries`` key (for
                example one that failed to load) yields an empty list.

        Returns:
            One Article per entry, in feed order, with ``source`` set to
            ``"Daily papers"`` and ``pdf_link`` left unset.
        """
        articles: List[Article] = []
        # Use .get so a feed lacking 'entries' does not raise AttributeError.
        for entry in feed.get('entries', []):
            title: str = entry.get('title', 'Без названия')
            link: str = entry.get('link', '')
            summary: str = entry.get('description', '')
            # feedparser exposes the RSS <pubDate> element under 'published';
            # 'pubDate' is kept only as a fallback for dict-like entries that
            # use the raw element name.
            published: str = (
                entry.get('published')
                or entry.get('pubDate', 'Неизвестно')
            )
            authors: str = entry.get('author', 'Неизвестно')

            articles.append(
                Article(
                    title=title,
                    link=link,
                    summary=summary,
                    published=published,
                    authors=authors,
                    source="Daily papers",
                )
            )
        return articles


class RSSFeedFetcher:
    """Loads and parses a single RSS feed identified by its URL."""

    def __init__(self, feed_url: str) -> None:
        self.feed_url = feed_url

    def fetch_feed(self) -> feedparser.FeedParserDict:
        """Load and parse the RSS feed at ``self.feed_url``.

        Returns:
            The parsed feed. If loading raises, or the parser flags the feed
            as ``bozo``, the error is printed and a feed with an empty
            ``entries`` list is returned instead; nothing is raised.
        """
        try:
            feed = feedparser.parse(self.feed_url)
            if feed.bozo:
                raise ValueError(f"Ошибка при парсинге RSS-ленты: {feed.bozo_exception}")
            return feed
        except Exception as e:
            print(f"Ошибка при загрузке ленты: {e}")
            # A bare FeedParserDict() has no 'entries' key, so `feed.entries`
            # in a parser would raise AttributeError. Provide an empty list
            # so callers can iterate the result safely.
            empty_feed = feedparser.FeedParserDict()
            empty_feed['entries'] = []
            return empty_feed

class RSSFeedProcessor:
    """Registry of RSS sources that fetches and parses their articles.

    Each source key maps to a feed URL and to the parser used for that feed.
    """

    def __init__(self) -> None:
        self.feed_parsers: Dict[str, BaseRSSParser] = {}
        self.feed_urls: Dict[str, str] = {}

    def register_feed(self, source_key: str, feed_url: str, parser: BaseRSSParser) -> None:
        """Register (or replace) the URL and parser for ``source_key``."""
        self.feed_parsers[source_key] = parser
        self.feed_urls[source_key] = feed_url

    def get_latest_articles(self, sources: Set[str], count: int = 1) -> List[List[Article]]:
        """Fetch and parse the requested sources.

        Args:
            sources: Keys of previously registered sources. Unknown keys are
                reported via ``print`` and skipped.
            count: Maximum number of articles kept per source.

        Returns:
            One list of articles per resolved source, in random order. The
            result is a list of lists, not a flat list of articles.
        """
        all_articles: List[List[Article]] = []
        for source_key in sources:
            parser = self.feed_parsers.get(source_key)
            feed_url = self.feed_urls.get(source_key)
            if not (parser and feed_url):
                print(f"Источник {source_key} не найден или не имеет парсера")
                continue

            feed = RSSFeedFetcher(feed_url).fetch_feed()
            # The parsed list is reversed before trimming, so the kept items
            # are the last `count` entries of the parser output.
            # NOTE(review): this is "latest" only if the feed lists oldest
            # entries first - confirm against the actual feeds.
            articles = parser.parse_feed(feed)[::-1]
            all_articles.append(articles[:count])

        # Shuffles the order of the per-source groups, not the articles
        # inside each group.
        random.shuffle(all_articles)
        return all_articles[:count * len(sources)]