from datetime import datetime
import re
import json

from langchain_core.messages import HumanMessage, SystemMessage

from LLMS import get_llm
from tools.search import get_search_tool
from tools.language_detector import is_english
from state import NewsState


# Greedy on purpose: spans from the first "[ {" to the last "} ]" so that a
# JSON list embedded in surrounding prose (or a code fence) is captured whole.
_JSON_LIST_PATTERN = re.compile(r'\[\s*\{.*\}\s*\]', re.DOTALL)


def _fallback_news_items():
    """Return a fresh placeholder item list used when no articles can be parsed."""
    # Built on every call so callers can mutate the result without affecting
    # later invocations.
    return [{
        "title": "AI News Roundup",
        "url": "https://example.com/ai-news",
        "source": "Various Sources",
        "description": "Compilation of latest AI news from various sources."
    }]


def _normalize_search_results(raw_results):
    """Coerce whatever the search tool returned into a list of dict results."""
    # NOTE(review): some search-tool versions appear to wrap the hits in a dict
    # under a "results" key instead of returning a bare list -- confirm against
    # the tool that get_search_tool() actually builds.
    if isinstance(raw_results, dict):
        raw_results = raw_results.get("results", [])
    # Anything else that is not a sequence of hits (e.g. an error string) is
    # treated as "no results" rather than iterated character by character.
    if not isinstance(raw_results, (list, tuple)):
        return []
    return [item for item in raw_results if isinstance(item, dict)]


def _response_text(content):
    """Flatten an LLM message's ``content`` into a single string."""
    if isinstance(content, str):
        return content
    if content is None:
        return ""
    if isinstance(content, list):
        # presumably a list of content blocks (plain strings or dicts carrying
        # a "text" entry); non-text blocks are ignored -- verify against the
        # model returned by get_llm().
        pieces = []
        for block in content:
            if isinstance(block, str):
                pieces.append(block)
            elif isinstance(block, dict) and isinstance(block.get("text"), str):
                pieces.append(block["text"])
        return "".join(pieces)
    return str(content)


def _extract_news_items(response_text):
    """Pull the JSON list of news items out of the LLM reply text.

    Returns the placeholder item list when no JSON list is found, when the
    matched text is not valid JSON, or when it contains no object entries.
    """
    json_match = _JSON_LIST_PATTERN.search(response_text)
    if json_match is None:
        return _fallback_news_items()
    try:
        parsed = json.loads(json_match.group(0))
    except json.JSONDecodeError:
        return _fallback_news_items()
    # Keep only JSON objects; stray strings/numbers in the list are dropped.
    news_items = [item for item in parsed if isinstance(item, dict)]
    return news_items or _fallback_news_items()


def search_ai_news(state: NewsState):
    """Search for the latest AI news and keep only English, non-YouTube hits.

    Args:
        state: Pipeline state. An optional ``"date"`` entry is embedded in the
            search query; when it is missing or empty, today's date
            (``YYYY-MM-DD``) is used instead.

    Returns:
        A dict with a single ``"search_results"`` key holding the filtered
        list of result dicts.
    """
    search_tool = get_search_tool()

    # ``or`` (not a .get default) so that an explicit None/"" also falls back
    # to today's date instead of putting the text "None" into the query.
    today = state.get("date") or datetime.now().strftime("%Y-%m-%d")

    # The date is part of the query to bias the search toward recent news.
    query = f"latest artificial intelligence news {today} english"
    raw_results = search_tool.invoke({"query": query})

    filtered_results = []
    for result in _normalize_search_results(raw_results):
        url = str(result.get("url") or "")
        if "youtube.com" in url.lower():
            continue
        # Language detection runs on content and title together; missing or
        # null fields count as empty text.
        content = f"{result.get('content') or ''} {result.get('title') or ''}"
        if is_english(content):
            filtered_results.append(result)

    return {"search_results": filtered_results}


def parse_news_items(state: NewsState):
    """Turn raw search results into structured news items via the LLM.

    The LLM is asked for a JSON list of objects with ``title``, ``url``,
    ``source`` and ``description``; the list is then extracted from its free
    text reply.

    Args:
        state: Pipeline state; its ``"search_results"`` entry is the list of
            result dicts to summarise.

    Returns:
        A dict with a single ``"news_items"`` key. When there is nothing to
        parse, or the reply contains no usable JSON list, the value is a
        one-element placeholder list.
    """
    search_results = [
        result for result in (state.get("search_results") or [])
        if isinstance(result, dict)
    ]

    # With no input the model could only answer with nothing or with invented
    # articles, so skip the call and return the placeholder directly.
    if not search_results:
        return {"news_items": _fallback_news_items()}

    # Format results for the LLM
    formatted_results = "\n\n".join(
        f"Title: {result.get('title', 'No title')}\n"
        f"URL: {result.get('url', 'No URL')}\n"
        f"Content: {result.get('content', 'No content')}"
        for result in search_results
    )

    # A plain prompt is used instead of structured output; the JSON is
    # recovered from the reply text afterwards.
    system_prompt = """
    Extract AI news articles from these search results. Filter out any that aren't about artificial intelligence.

    For each relevant AI news article, provide:
    - title: The title of the article
    - url: The URL of the article
    - source: The source website of the news
    - description: A brief description of the article

    Format your response as a JSON list of objects. Only include the relevant fields, nothing else.

    Example format:
    [
        {
            "title": "New AI Development",
            "url": "https://example.com/news/ai-dev",
            "source": "Example News",
            "description": "Description of the AI development"
        }
    ]
    """

    llm = get_llm()
    response = llm.invoke([
        SystemMessage(content=system_prompt),
        HumanMessage(content=f"Here are the search results:\n\n{formatted_results}")
    ])

    news_items = _extract_news_items(_response_text(response.content))
    return {"news_items": news_items}