# Hugging Face repo metadata (non-code residue from the original upload):
# v1shal's picture
# first_commit
# b396e94
import nest_asyncio
from typing import List, Dict
from duckduckgo_search import DDGS
from phi.tools.newspaper4k import Newspaper4k
import time
# Patch the running event loop so async-based libraries (DDGS/newspaper
# internals) work inside environments that already own a loop, e.g. notebooks.
nest_asyncio.apply()
def extract_news(article_topic: str, num_search_results: int = 15, max_retries: int = 3) -> List[Dict[str, str]]:
    """Extract full news articles for a topic via DuckDuckGo news search.

    Args:
        article_topic: The topic to search for.
        num_search_results: Number of search results to request; also the cap
            on how many articles are returned.
        max_retries: Maximum scrape attempts per article URL before skipping it.

    Returns:
        A list of dicts with keys "title" and "text", one per successfully
        scraped article. May be shorter than num_search_results when scraping
        fails or an article's text is too short (<= 100 characters).
    """
    news_results: List[Dict[str, str]] = []
    ddgs = DDGS()
    newspaper_tools = Newspaper4k()
    # ddgs.news may return None on failure; coerce to an empty list so the
    # loop below is always safe to run.
    results = ddgs.news(keywords=article_topic, max_results=num_search_results) or []
    for r in results:
        if "url" not in r:
            continue
        for _attempt in range(max_retries):
            try:
                article_data = newspaper_tools.get_article_data(r["url"])
            except Exception:
                # Best-effort scraping: individual URLs can 404, time out, or
                # block bots. Back off briefly and retry rather than aborting
                # the whole extraction.
                time.sleep(1)
                continue
            # Require a substantive body; skip stubs/paywalled pages.
            if article_data and len(article_data.get("text", "")) > 100:
                news_results.append({
                    "title": r.get("title", "No Title"),
                    "text": article_data["text"],  # full article text
                })
                break  # successful extraction; stop retrying this URL
            time.sleep(1)  # thin/empty result: wait before retrying
        # Stop once enough articles have been collected.
        if len(news_results) >= num_search_results:
            break
    return news_results