import requests
from bs4 import BeautifulSoup
from newspaper import Article

from CrawDag.models import News


def clean_content(text: str) -> str:
    """Helper function to clean article content."""
    return text.strip().replace("\n", " ").replace("\t", " ")


def scrape_basic_article(news: News) -> None:
    """Fetch the article page directly and populate news.content and news.html."""
    # NOTE: SSL certificate verification is disabled for this request.
    response = requests.get(news.link, verify=False)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Concatenate the text of every <p> tag except the last one.
    paragraphs = soup.find_all('p')
    content = ' '.join(para.get_text() for para in paragraphs[:-1])
    # find() returns the first <article> tag, or None if the page has none.
    html = soup.find('article')
    news.content = clean_content(content)
    news.html = html


def scrape_news_v2(news: News) -> None:
    """Scrape the article with newspaper3k and populate news.content and news.html."""
    article = Article(news.link)
    article.download()
    article.parse()
    # newspaper separates paragraphs with newlines; drop the last paragraph.
    paragraphs = article.text.split('\n')
    news.content = '\n'.join(paragraphs[:-1]).strip()
    news.html = article.html
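

# Minimal usage sketch (assumptions): the News constructor call below is
# hypothetical; only the `link`, `content`, and `html` attributes used above
# are known from this module. Adjust construction to match CrawDag.models.News.
if __name__ == '__main__':
    sample = News(link='https://example.com/some-article')  # hypothetical constructor
    try:
        scrape_news_v2(sample)        # preferred path: newspaper3k extraction
    except Exception:
        scrape_basic_article(sample)  # fallback: plain requests + BeautifulSoup
    print(sample.content[:200])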