# Hugging Face repo metadata (non-code residue from the original upload):
# v1shal's picture
# first_commit
# b396e94
import nest_asyncio
from typing import List, Dict
from duckduckgo_search import DDGS
from phi.tools.newspaper4k import Newspaper4k
import time
# Patch the running event loop so async-based libraries (DDGS/newspaper
# internals) work inside environments that already own a loop, e.g. notebooks.
nest_asyncio.apply()
def extract_news(article_topic: str, num_search_results: int = 15, max_retries: int = 3) -> List[Dict[str, str]]:
    """Extract full news articles for a topic via DuckDuckGo news search.

    Args:
        article_topic: The topic to search for.
        num_search_results: Number of search results to request; also the cap
            on how many articles are returned.
        max_retries: Maximum scrape attempts per article URL before skipping it.

    Returns:
        A list of dicts with keys "title" and "text", one per successfully
        scraped article. May be shorter than num_search_results when scraping
        fails or an article's text is too short (<= 100 characters).
    """
    news_results: List[Dict[str, str]] = []
    ddgs = DDGS()
    newspaper_tools = Newspaper4k()
    # ddgs.news may return None on failure; coerce to an empty list so the
    # loop below is always safe to run.
    results = ddgs.news(keywords=article_topic, max_results=num_search_results) or []
    for r in results:
        if "url" not in r:
            continue
        for _attempt in range(max_retries):
            try:
                article_data = newspaper_tools.get_article_data(r["url"])
            except Exception:
                # Best-effort scraping: individual URLs can 404, time out, or
                # block bots. Back off briefly and retry rather than aborting
                # the whole extraction.
                time.sleep(1)
                continue
            # Require a substantive body; skip stubs/paywalled pages.
            if article_data and len(article_data.get("text", "")) > 100:
                news_results.append({
                    "title": r.get("title", "No Title"),
                    "text": article_data["text"],  # full article text
                })
                break  # successful extraction; stop retrying this URL
            time.sleep(1)  # thin/empty result: wait before retrying
        # Stop once enough articles have been collected.
        if len(news_results) >= num_search_results:
            break
    return news_results