# NOTE(review): the original three lines here ("Spaces:" / "No application file" x2)
# appear to be page-scrape/extraction residue, not code — converted to this
# comment so the file parses as Python.
import time
from typing import Dict, List

import nest_asyncio
from duckduckgo_search import DDGS
from phi.tools.newspaper4k import Newspaper4k

# Patch the current event loop to allow nested asyncio runs — presumably
# needed because duckduckgo_search drives asyncio internally and this script
# may run inside an environment that already has a loop (e.g. Jupyter).
# TODO(review): confirm this is still required for the installed
# duckduckgo_search version.
nest_asyncio.apply()
def extract_news(article_topic: str, num_search_results: int = 15, max_retries: int = 3) -> List[Dict[str, str]]:
    """
    Extract full news articles for a topic via DuckDuckGo news search.

    Each search hit is scraped with Newspaper4k. A hit whose article body is
    missing or shorter than 100 characters, or whose scrape raises, is retried
    up to ``max_retries`` times before being skipped.

    Args:
        article_topic: The topic to search for.
        num_search_results: Maximum number of articles to return (also the
            number of search results requested from DuckDuckGo).
        max_retries: Maximum scrape attempts per URL before giving up.

    Returns:
        A list of dicts, each with keys ``"title"`` and ``"text"`` (the full
        article body). URLs that never yielded a usable body are silently
        omitted, so the list may be shorter than ``num_search_results``.
    """
    news_results: List[Dict[str, str]] = []
    ddgs = DDGS()
    newspaper_tools = Newspaper4k()
    # Guard against a None/empty result set from the search client.
    results = ddgs.news(keywords=article_topic, max_results=num_search_results) or []
    for r in results:
        url = r.get("url")
        if not url:
            continue
        for _attempt in range(max_retries):
            try:
                article_data = newspaper_tools.get_article_data(url)
            except Exception:
                # Best-effort scrape: network/parse failures are expected for
                # some sites — pause briefly and retry this URL.
                time.sleep(1)
                continue
            # Require a non-trivial body (> 100 chars) so paywall stubs and
            # empty pages are not kept.
            if article_data and len(article_data.get("text", "")) > 100:
                news_results.append({
                    "title": r.get("title", "No Title"),
                    "text": article_data["text"],  # full article text
                })
                break  # usable article captured — stop retrying this URL
            time.sleep(1)  # body missing/too short — pause before retrying
        # Stop once enough articles have been collected.
        if len(news_results) >= num_search_results:
            break
    return news_results