AiNewsV2 / nodes /search_nodes.py
Phoenix21's picture
Modular code
7516245
from datetime import datetime
import re
import json
from langchain_core.messages import HumanMessage, SystemMessage
from LLMS import get_llm
from tools.search import get_search_tool
from tools.language_detector import is_english
from state import NewsState
def search_ai_news(state: NewsState):
    """Search for the latest AI news and keep only English, non-YouTube hits.

    Args:
        state: Graph state. An optional ``"date"`` entry is embedded in the
            search query; when it is missing or empty, today's date
            (``YYYY-MM-DD``) is used instead.

    Returns:
        A dict ``{"search_results": [...]}`` containing the result dicts
        whose URL is not a YouTube link and whose content plus title passes
        ``is_english``. The list is empty when the search tool does not
        return a list of results.
    """
    search_tool = get_search_tool()

    # `or` (rather than a .get default) also covers an explicit None/empty
    # date, which would otherwise end up verbatim in the query text.
    today = state.get("date") or datetime.now().strftime("%Y-%m-%d")

    # Include the date in the query to bias the search towards recent news.
    query = f"latest artificial intelligence news {today} english"

    search_results = search_tool.invoke({"query": query})

    # A non-list return value (e.g. an error string) cannot be filtered as
    # result dicts; treat it as "no results" instead of crashing on .get().
    if not isinstance(search_results, list):
        return {"search_results": []}

    youtube_markers = ("youtube.com", "youtu.be")
    filtered_results = []
    for result in search_results:
        if not isinstance(result, dict):
            continue

        url = str(result.get("url") or "").lower()
        if any(marker in url for marker in youtube_markers):
            continue

        # Keys may be present with a None value, so normalise before joining.
        content = f"{result.get('content') or ''} {result.get('title') or ''}"
        if is_english(content):
            filtered_results.append(result)

    return {"search_results": filtered_results}
# System prompt instructing the LLM to return the AI-related articles as a
# JSON list of objects.
_NEWS_EXTRACTION_PROMPT = """
Extract AI news articles from these search results. Filter out any that aren't about artificial intelligence.
For each relevant AI news article, provide:
- title: The title of the article
- url: The URL of the article
- source: The source website of the news
- description: A brief description of the article
Format your response as a JSON list of objects. Only include the relevant fields, nothing else.
Example format:
[
{
"title": "New AI Development",
"url": "https://example.com/news/ai-dev",
"source": "Example News",
"description": "Description of the AI development"
}
]
"""

# Placeholder item returned when no usable JSON can be recovered from the LLM.
_FALLBACK_NEWS_ITEM = {
    "title": "AI News Roundup",
    "url": "https://example.com/ai-news",
    "source": "Various Sources",
    "description": "Compilation of latest AI news from various sources.",
}


def _fallback_news_items():
    """Return a fresh one-element list holding a copy of the placeholder item."""
    return [dict(_FALLBACK_NEWS_ITEM)]


def _message_text(content):
    """Flatten message content (a string or a list of parts) into one string."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        parts = []
        for part in content:
            if isinstance(part, str):
                parts.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                parts.append(part["text"])
        return "".join(parts)
    return str(content)


def _extract_news_items(response_text):
    """Parse the JSON list of news items embedded in the LLM response text."""
    # Greedy match from the first "[ {" to the last "} ]" so any prose the
    # model wraps around the JSON list is ignored.
    json_match = re.search(r'\[\s*\{.*\}\s*\]', response_text, re.DOTALL)
    if not json_match:
        return _fallback_news_items()

    try:
        parsed = json.loads(json_match.group(0))
    except json.JSONDecodeError:
        return _fallback_news_items()

    # Drop any entry that is not a JSON object so only dict items are returned.
    return [item for item in parsed if isinstance(item, dict)]


def parse_news_items(state: NewsState):
    """Parse search results into structured news items via the LLM.

    The search results are rendered as plain text, sent to the LLM together
    with a prompt asking for a JSON list, and the first JSON list found in
    the reply is decoded.

    Args:
        state: Graph state; ``state["search_results"]`` must hold the result
            dicts (``title``, ``url`` and ``content`` keys are read, each
            with a textual default when missing).

    Returns:
        A dict ``{"news_items": [...]}``. When the reply contains no JSON
        list or the list cannot be decoded, the list holds a single
        placeholder "AI News Roundup" item.

    Raises:
        KeyError: If ``state`` has no ``"search_results"`` entry.
    """
    search_results = state["search_results"]

    # Render each result as a small labelled text block for the LLM.
    formatted_results = "\n\n".join(
        f"Title: {result.get('title', 'No title')}\n"
        f"URL: {result.get('url', 'No URL')}\n"
        f"Content: {result.get('content', 'No content')}"
        for result in search_results
    )

    llm = get_llm()
    response = llm.invoke([
        SystemMessage(content=_NEWS_EXTRACTION_PROMPT),
        HumanMessage(content=f"Here are the search results:\n\n{formatted_results}"),
    ])

    # Message content is not guaranteed to be a plain string; normalise it
    # before running the regex so a list-of-parts reply does not raise.
    response_text = _message_text(response.content)

    return {"news_items": _extract_news_items(response_text)}