import time
from typing import Dict, List

import nest_asyncio
from duckduckgo_search import DDGS
from phi.tools.newspaper4k import Newspaper4k

# Allow nested event loops, e.g. when running inside a Jupyter notebook.
nest_asyncio.apply()

def extract_news(article_topic: str, num_search_results: int = 15, max_retries: int = 3) -> List[Dict[str, str]]:
    """
    Extract full news articles for a given topic via DuckDuckGo news search.

    Args:
        article_topic: The topic to search for.
        num_search_results: The number of articles to collect.
        max_retries: The maximum number of retries per article if scraping fails.

    Returns:
        A list of dictionaries, one per article, with "title" and "text" keys.
    """
    news_results = []
    ddgs = DDGS()
    newspaper_tools = Newspaper4k()
    
    # Fetch extra results so failed scrapes can be compensated for;
    # the loop below stops once enough articles have been collected.
    results = ddgs.news(keywords=article_topic, max_results=num_search_results * 2)
    
    for r in results:
        if "url" not in r:
            continue

        retries = 0
        while retries < max_retries:
            try:
                article_data = newspaper_tools.get_article_data(r["url"])

                # Accept only articles with a reasonable amount of text.
                if article_data and "text" in article_data and len(article_data["text"]) > 100:
                    news_results.append({
                        "title": r.get("title", "No Title"),
                        "text": article_data["text"],  # Full article text
                    })
                    break  # Successful extraction; stop retrying this URL.
                retries += 1
                time.sleep(1)  # Brief pause before retrying.
            except Exception:
                # Scraping failed (network or parsing error): retry after a pause.
                retries += 1
                time.sleep(1)

        # Stop once enough articles have been collected.
        if len(news_results) >= num_search_results:
            break
    
    return news_results
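

# A minimal usage sketch. The topic string and result count below are
# illustrative assumptions, not taken from the source.
if __name__ == "__main__":
    articles = extract_news("artificial intelligence", num_search_results=3)
    for article in articles:
        print(article["title"])
        print(article["text"][:200], "...")
        print("-" * 40)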