Spaces:

Eurosmart
/

airflow-pipeline

Runtime error

File size: 774 Bytes
from CrawDag.scraping.Scraper import Scraper
from CrawDag.models import News
from bs4 import BeautifulSoup
import requests

class ScrapeBasic(Scraper):
    """Scraper that downloads each news link and extracts its text and markup.

    Every item's ``link`` is fetched with ``requests`` and parsed with
    BeautifulSoup's ``html.parser``. The extracted values are written back
    onto the same ``News`` objects that were passed in.
    """

    # Seconds to wait on a server before giving up. Without a timeout,
    # ``requests`` waits forever and a single dead host stalls the whole run.
    DEFAULT_TIMEOUT = 30.0

    def __init__(self, listNews: list[News], timeout: float = DEFAULT_TIMEOUT) -> None:
        """Store the news items to scrape.

        Args:
            listNews: Items whose ``link`` attribute is the URL to fetch.
            timeout: Seconds passed as ``timeout`` to ``requests.get`` for
                every download.
        """
        self.listNews = listNews
        self.timeout = timeout

    def scrape(self) -> list[News]:
        """Fetch every news page and populate ``content`` and ``html``.

        For each item, ``content`` becomes the space-joined text of all
        ``<p>`` elements except the last one, stripped of surrounding
        whitespace, and ``html`` becomes the first ``<article>`` element
        found by BeautifulSoup (``None`` when the page has none).

        Returns:
            A new list holding the same (mutated in place) ``News`` objects,
            in their original order.

        Raises:
            requests.exceptions.RequestException: If a page cannot be
                fetched or the request times out. Items processed before
                the failure have already been updated.
        """
        scraped: list[News] = []
        for news in self.listNews:
            # NOTE(review): verify=False disables TLS certificate checks and
            # is kept only to preserve existing behaviour; confirm that the
            # target sites still require it.
            response = requests.get(news.link, verify=False, timeout=self.timeout)
            soup = BeautifulSoup(response.content, 'html.parser')

            paragraphs = soup.find_all('p')
            # The final <p> is deliberately left out -- presumably a footer
            # or credit line on the target sites; TODO confirm.
            content = ' '.join(para.get_text() for para in paragraphs[:-1])

            news.content = content.strip()
            # NOTE(review): this stores the parsed element object, not a
            # string; confirm that is what News.html consumers expect.
            news.html = soup.find('article')
            scraped.append(news)

        return scraped