from CrawDag.crawling.Crawler import Crawler
from CrawDag.models import News
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import requests
import pytz
import html

class VnexpressCrawler(Crawler):
    def __init__(self, topics: dict[str, str]) -> None:
        super().__init__(topics)
    def crawl(self) -> list[News]:
        """Fetch each topic's RSS feed and return news published within the last 24 hours."""
        news = []
        for topic in self.topics:
            # Fetch the topic's RSS feed (SSL verification is disabled for this source).
            response = requests.get(self.topics[topic], verify=False)
            soup = BeautifulSoup(response.content, 'xml')
            # Cutoff: only keep items published in the last 24 hours (Vietnam time).
            cutoff = datetime.now(pytz.timezone('Asia/Ho_Chi_Minh')) - timedelta(hours=24)
            for item in soup.find_all('item'):
                link = item.find('link').text
                pub_date_text = item.find('pubDate').text
                date = datetime.strptime(pub_date_text, '%a, %d %b %Y %H:%M:%S %z')
                title = html.unescape(item.find('title').text.strip())
                # The <description> element contains HTML; extract the thumbnail <img>, if any.
                description = item.find('description').text
                description_soup = BeautifulSoup(description, 'html.parser')
                img_tag = description_soup.find('img')
                image = img_tag['src'] if img_tag else None
                if date >= cutoff:
                    news.append(News(topic=topic, title=title, link=link, date=date, image=image))
        return news
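

# --- Usage sketch (illustrative; not part of the original module) ---
# Assumes `topics` maps a topic name to a VnExpress RSS feed URL and that
# `News` exposes its constructor fields (title, link, ...) as attributes.
# The feed URL below is only an example; the feeds actually configured by
# CrawDag may differ.
if __name__ == '__main__':
    crawler = VnexpressCrawler(topics={'thoi-su': 'https://vnexpress.net/rss/thoi-su.rss'})
    for article in crawler.crawl():
        print(article.title, article.link)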