Spaces:

Phoenix21
/

AiNewsV2

Sleeping

App Files Files Community

AiNewsV2 / nodes /search_nodes.py

Phoenix21

Modular code

7516245 4 months ago

raw

history blame contribute delete

3.66 kB

	from datetime import datetime
	import re
	import json
	from langchain_core.messages import HumanMessage, SystemMessage
	from LLMS import get_llm
	from tools.search import get_search_tool
	from tools.language_detector import is_english
	from state import NewsState

	def search_ai_news(state: NewsState):
	    """Run a dated AI-news web search and keep only usable hits.

	    The search tool comes from ``get_search_tool()`` (Tavily, per the
	    original docstring). The date is read from ``state["date"]`` when
	    present, otherwise today's date formatted as ``YYYY-MM-DD``.

	    Args:
	        state: Graph state; only the optional ``"date"`` key is read.

	    Returns:
	        ``{"search_results": [...]}`` holding, in original order, every hit
	        whose URL does not contain ``youtube.com`` and whose content plus
	        title passes ``is_english``.
	    """

	    def _is_wanted(hit):
	        # Reject video links first so the language check only runs on
	        # candidates that can still be kept.
	        if "youtube.com" in hit.get("url", "").lower():
	            return False
	        text = hit.get("content", "") + " " + hit.get("title", "")
	        return is_english(text)

	    tool = get_search_tool()
	    run_date = state.get("date", datetime.now().strftime("%Y-%m-%d"))

	    # Embedding the date in the query biases the search toward recent news.
	    hits = tool.invoke(
	        {"query": f"latest artificial intelligence news {run_date} english"}
	    )

	    return {"search_results": [hit for hit in hits if _is_wanted(hit)]}

	def _fallback_news_items():
	    """Return a fresh one-item placeholder list for when no JSON is recovered."""
	    return [{
	        "title": "AI News Roundup",
	        "url": "https://example.com/ai-news",
	        "source": "Various Sources",
	        "description": "Compilation of latest AI news from various sources."
	    }]


	def _extract_json_list(response_text):
	    """Return the JSON list of objects embedded in *response_text*, or None.

	    The model may wrap the JSON in prose or code fences, so the text is
	    searched for a span running from the first ``[ {`` to the last ``} ]``.
	    None is returned when no such span exists or it is not valid JSON.
	    """
	    # The quantifiers matter: ``\s*`` tolerates any (or no) whitespace after
	    # the opening bracket, and ``.*`` with DOTALL spans the multi-line objects.
	    # Without them the pattern only matches strings like "[ {x}]", i.e. never
	    # a real list of JSON objects.
	    match = re.search(r'\[\s*\{.*\}\s*\]', response_text, re.DOTALL)
	    if match is None:
	        return None
	    try:
	        return json.loads(match.group(0))
	    except json.JSONDecodeError:
	        return None


	def parse_news_items(state: NewsState):
	    """Turn raw search results into structured news items via the LLM.

	    The search results are rendered as plain text and sent to the model
	    returned by ``get_llm()`` with instructions to reply with a JSON list
	    of ``title`` / ``url`` / ``source`` / ``description`` objects. That list
	    is then extracted from the reply text.

	    Args:
	        state: Graph state; ``state["search_results"]`` must be an iterable
	            of dict-like results read with ``.get("title")``,
	            ``.get("url")`` and ``.get("content")``.

	    Returns:
	        ``{"news_items": items}`` where ``items`` is the parsed JSON list.
	        If the reply contains no JSON list of objects, or the list fails to
	        parse, ``items`` is a single placeholder entry.

	    Raises:
	        KeyError: If ``state`` has no ``"search_results"`` key.
	    """
	    search_results = state["search_results"]

	    # Format results for the LLM
	    formatted_results = "\n\n".join(
	        f"Title: {result.get('title', 'No title')}\n"
	        f"URL: {result.get('url', 'No URL')}\n"
	        f"Content: {result.get('content', 'No content')}"
	        for result in search_results
	    )

	    # Use a direct prompt instead of structured output
	    system_prompt = """
	Extract AI news articles from these search results. Filter out any that aren't about artificial intelligence.

	For each relevant AI news article, provide:
	- title: The title of the article
	- url: The URL of the article
	- source: The source website of the news
	- description: A brief description of the article

	Format your response as a JSON list of objects. Only include the relevant fields, nothing else.
	Example format:
	[
	{
	"title": "New AI Development",
	"url": "https://example.com/news/ai-dev",
	"source": "Example News",
	"description": "Description of the AI development"
	}
	]
	"""

	    # Get the response as a string
	    llm = get_llm()
	    response = llm.invoke([
	        SystemMessage(content=system_prompt),
	        HumanMessage(content=f"Here are the search results:\n\n{formatted_results}")
	    ])

	    # Pull the JSON list out of the reply; fall back to the placeholder when
	    # the reply has no parseable list so downstream nodes always get items.
	    news_items = _extract_json_list(response.content)
	    if news_items is None:
	        news_items = _fallback_news_items()

	    return {"news_items": news_items}