|
from datetime import datetime |
|
import re |
|
import json |
|
from langchain_core.messages import HumanMessage, SystemMessage |
|
from LLMS import get_llm |
|
from tools.search import get_search_tool |
|
from tools.language_detector import is_english |
|
from state import NewsState |
|
|
|
def search_ai_news(state: NewsState):
    """Search for the latest AI news and keep only English, non-YouTube hits.

    The search tool comes from ``get_search_tool()`` (Tavily, according to the
    original docstring of this function).

    Args:
        state: Graph state. Its optional ``"date"`` entry is embedded in the
            search query; when it is missing or empty, today's local date
            formatted as ``YYYY-MM-DD`` is used instead.

    Returns:
        A dict with a single key, ``"search_results"``, holding the result
        dicts whose URL does not contain ``youtube.com`` and whose content
        plus title are accepted by ``is_english``.
    """
    # Function-scope import so this block does not depend on a change to the
    # file's import section.
    import logging

    search_tool = get_search_tool()

    # `or` (instead of a .get default) also covers a present-but-empty date.
    today = state.get("date") or datetime.now().strftime("%Y-%m-%d")
    query = f"latest artificial intelligence news {today} english"

    raw_results = search_tool.invoke({"query": query})

    # NOTE(review): the exact return shape depends on which tool
    # get_search_tool() builds -- confirm. Some Tavily tools wrap the hits in
    # a dict under "results", and some return an error string on failure.
    if isinstance(raw_results, dict):
        raw_results = raw_results.get("results")
    if not isinstance(raw_results, (list, tuple)):
        logging.getLogger(__name__).warning(
            "Search tool returned an unexpected payload (%s); "
            "treating it as no results",
            type(raw_results).__name__,
        )
        raw_results = []

    filtered_results = []
    for result in raw_results:
        # Anything that is not a mapping cannot carry url/content/title.
        if not isinstance(result, dict):
            continue

        url = result.get("url") or ""
        if "youtube.com" in url.lower():
            continue

        # Fields may be present but None; treat those as empty text.
        content = (result.get("content") or "") + " " + (result.get("title") or "")
        if is_english(content):
            filtered_results.append(result)

    return {"search_results": filtered_results}
|
|
|
def _fallback_news_items():
    """Return a fresh placeholder list used when the LLM reply is unusable."""
    return [{
        "title": "AI News Roundup",
        "url": "https://example.com/ai-news",
        "source": "Various Sources",
        "description": "Compilation of latest AI news from various sources.",
    }]


def _message_text(content):
    """Flatten chat-message content (a string or a list of parts) to a string."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        pieces = []
        for part in content:
            if isinstance(part, str):
                pieces.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                pieces.append(part["text"])
        return "".join(pieces)
    return str(content)


def _extract_news_items(text):
    """Return the first JSON array of objects found in *text*, else the fallback."""
    decoder = json.JSONDecoder()
    # Try every "[ {" as a possible array start. raw_decode stops at the end
    # of the first complete JSON value, so prose (even bracketed prose) after
    # the array no longer breaks parsing the way a greedy regex span does.
    for candidate in re.finditer(r'\[\s*\{', text):
        try:
            parsed, _ = decoder.raw_decode(text, candidate.start())
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, list):
            items = [item for item in parsed if isinstance(item, dict)]
            if items:
                return items
    return _fallback_news_items()


def parse_news_items(state: NewsState):
    """Turn raw search results into structured news items via the LLM.

    The search results are rendered as plain text, sent to the model from
    ``get_llm()`` with instructions to answer with a JSON list, and the JSON
    array is then extracted from the reply.

    Args:
        state: Graph state; ``state["search_results"]`` must hold the result
            dicts, from which ``title``, ``url`` and ``content`` are read.

    Returns:
        A dict with a single key, ``"news_items"``: the list of objects parsed
        from the model's reply, or a one-element placeholder list when no
        JSON array of objects can be extracted.

    Raises:
        KeyError: If ``"search_results"`` is missing from ``state``.
    """
    search_results = state["search_results"]

    formatted_results = "\n\n".join(
        f"Title: {result.get('title', 'No title')}\n"
        f"URL: {result.get('url', 'No URL')}\n"
        f"Content: {result.get('content', 'No content')}"
        for result in search_results
    )

    system_prompt = """
    Extract AI news articles from these search results. Filter out any that aren't about artificial intelligence.

    For each relevant AI news article, provide:
    - title: The title of the article
    - url: The URL of the article
    - source: The source website of the news
    - description: A brief description of the article

    Format your response as a JSON list of objects. Only include the relevant fields, nothing else.
    Example format:
    [
        {
            "title": "New AI Development",
            "url": "https://example.com/news/ai-dev",
            "source": "Example News",
            "description": "Description of the AI development"
        }
    ]
    """

    llm = get_llm()
    response = llm.invoke([
        SystemMessage(content=system_prompt),
        HumanMessage(content=f"Here are the search results:\n\n{formatted_results}"),
    ])

    # Message content can be a list of content parts rather than a plain
    # string; normalise before searching it for JSON.
    response_text = _message_text(response.content)

    return {"news_items": _extract_news_items(response_text)}