import time
from typing import Dict, List

import nest_asyncio
from duckduckgo_search import DDGS
from phi.tools.newspaper4k import Newspaper4k

# Allow nested event loops (needed when running inside notebooks such as Jupyter).
nest_asyncio.apply()


def extract_news(article_topic: str, num_search_results: int = 15, max_retries: int = 3) -> List[Dict[str, str]]:
    """
    Extracts full news articles for the given topic.

    Args:
        article_topic: The topic to search for.
        num_search_results: The number of articles to return.
        max_retries: The maximum number of retries if an article fails to scrape.

    Returns:
        A list of dictionaries, where each dictionary represents a news article.
    """
    news_results = []
    ddgs = DDGS()
    newspaper_tools = Newspaper4k()

    # Fetch extra results so that failed scrapes don't leave us short of articles.
    results = ddgs.news(keywords=article_topic, max_results=num_search_results * 2)

    for r in results:
        if "url" not in r:
            continue
        retries = 0
        while retries < max_retries:
            try:
                article_data = newspaper_tools.get_article_data(r["url"])
                # Keep only articles with a meaningful amount of text.
                if article_data and "text" in article_data and len(article_data["text"]) > 100:
                    news_results.append({
                        "title": r.get("title", "No Title"),
                        "text": article_data["text"],  # Full article text
                    })
                    break  # Successful extraction; stop retrying this URL.
                retries += 1
                time.sleep(1)  # Wait before retrying.
            except Exception:
                retries += 1
                time.sleep(1)
        # Stop once we have collected enough articles.
        if len(news_results) >= num_search_results:
            break

    return news_results
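

# Example usage: a minimal sketch, assuming network access is available and
# that the topic "artificial intelligence" (a hypothetical query, not from the
# original file) returns scrapeable articles. Guarded so importing this module
# does not trigger network calls.
if __name__ == "__main__":
    articles = extract_news("artificial intelligence", num_search_results=5)
    for article in articles:
        print(article["title"])
        print(article["text"][:200], "...\n")  # Preview the first 200 characters.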