File size: 3,660 Bytes
7516245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
from datetime import datetime
import re
import json
from langchain_core.messages import HumanMessage, SystemMessage
from LLMS import get_llm
from tools.search import get_search_tool
from tools.language_detector import is_english
from state import NewsState

def search_ai_news(state: NewsState):
    """Search for recent AI news and keep only English, non-YouTube results.

    Args:
        state: Workflow state. An optional ``"date"`` entry is embedded in the
            search query; when it is missing or empty, today's local date
            formatted as ``YYYY-MM-DD`` is used instead.

    Returns:
        A dict with a single ``"search_results"`` key holding the result
        dicts that passed both filters (possibly an empty list).
    """
    search_tool = get_search_tool()

    # `or` instead of a .get() default: a key that is present but None/empty
    # would otherwise end up as the literal text "None" in the query.
    today = state.get("date") or datetime.now().strftime("%Y-%m-%d")

    # Include the date in the query to bias the search toward recent news.
    query = f"latest artificial intelligence news {today} english"
    raw_results = search_tool.invoke({"query": query})

    # NOTE(review): the tool's return shape is not visible from this function.
    # Some search tools wrap hits as {"results": [...]} or hand back an error
    # string; anything that is not a sequence of dicts is treated as
    # "no results" instead of crashing on `.get` -- confirm against
    # get_search_tool().
    if isinstance(raw_results, dict):
        raw_results = raw_results.get("results")
    if not isinstance(raw_results, (list, tuple)):
        raw_results = []

    filtered_results = []
    for result in raw_results:
        if not isinstance(result, dict):
            continue

        # Skip YouTube links.
        url = result.get("url") or ""
        if "youtube.com" in url.lower():
            continue

        # Language detection runs on body + title; `or ""` guards against
        # fields that are present but None.
        text = f"{result.get('content') or ''} {result.get('title') or ''}"
        if is_english(text):
            filtered_results.append(result)

    return {"search_results": filtered_results}

def _fallback_news_items():
    """Return the placeholder item used when no usable JSON list is found."""
    # Built fresh on every call so callers can mutate the result safely.
    return [{
        "title": "AI News Roundup",
        "url": "https://example.com/ai-news",
        "source": "Various Sources",
        "description": "Compilation of latest AI news from various sources."
    }]


def _message_text(content):
    """Flatten an LLM message ``content`` value into plain text."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # NOTE(review): assumes list-style content is made of plain strings
        # and/or dict blocks carrying a "text" entry -- confirm for the model
        # returned by get_llm().
        parts = []
        for part in content:
            if isinstance(part, str):
                parts.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                parts.append(part["text"])
        return "".join(parts)
    return "" if content is None else str(content)


def _extract_news_items(text):
    """Return the first JSON list in *text* that contains objects, else None."""
    decoder = json.JSONDecoder()
    start = text.find("[")
    while start != -1:
        try:
            # raw_decode parses one JSON value starting at `start` and ignores
            # whatever follows it (closing code fences, commentary, ...).
            parsed, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, list):
            items = [entry for entry in parsed if isinstance(entry, dict)]
            if items:
                return items
        # Not a usable list (e.g. "[1]" or "[see below]"): try the next "[".
        start = text.find("[", start + 1)
    return None


def parse_news_items(state: NewsState):
    """Turn raw search results into structured news items via the LLM.

    The search results in ``state["search_results"]`` are rendered as text and
    sent to the LLM, which is asked to answer with a JSON list of objects
    (``title``, ``url``, ``source``, ``description``). The first JSON list of
    objects found in the reply is used; surrounding prose or code fences are
    tolerated.

    Args:
        state: Workflow state; must contain ``"search_results"``, an iterable
            of dicts read via ``title`` / ``url`` / ``content`` keys.

    Returns:
        A dict with a single ``"news_items"`` key. When the reply contains no
        JSON list with at least one object, a single generic placeholder item
        is returned instead.

    Raises:
        KeyError: If ``state`` has no ``"search_results"`` entry.
    """
    search_results = state["search_results"]

    # Format results for the LLM
    formatted_results = "\n\n".join(
        f"Title: {result.get('title', 'No title')}\n"
        f"URL: {result.get('url', 'No URL')}\n"
        f"Content: {result.get('content', 'No content')}"
        for result in search_results
    )

    # Use a direct prompt instead of structured output
    system_prompt = """
    Extract AI news articles from these search results. Filter out any that aren't about artificial intelligence.
    
    For each relevant AI news article, provide:
    - title: The title of the article
    - url: The URL of the article
    - source: The source website of the news
    - description: A brief description of the article
    
    Format your response as a JSON list of objects. Only include the relevant fields, nothing else.
    Example format:
    [
      {
        "title": "New AI Development",
        "url": "https://example.com/news/ai-dev",
        "source": "Example News",
        "description": "Description of the AI development"
      }
    ]
    """

    llm = get_llm()
    response = llm.invoke([
        SystemMessage(content=system_prompt),
        HumanMessage(content=f"Here are the search results:\n\n{formatted_results}")
    ])

    # The content may not be a plain string, so normalise it before parsing.
    response_text = _message_text(response.content)

    news_items = _extract_news_items(response_text)
    if not news_items:
        # Same best-effort behaviour as before: never return an empty list.
        news_items = _fallback_news_items()

    return {"news_items": news_items}