Phoenix21 committed · Commit 7516245 · 1 Parent(s): 225af9f

Modular code

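The commit splits the generator into separate packages: LLMS (model setup and Pydantic schemas), graph (LangGraph workflows), nodes, state, tools, and ui, with core.py tying them together. A minimal sketch of how the refactored modules are intended to be driven, assuming GROQ_API_KEY and TAVILY_API_KEY are already set in the environment (illustrative only, not part of the commit):

    # Generate today's roundup: Tavily search -> article scraping -> Groq-written sections
    from core import generate_ai_news_blog

    blog_markdown = generate_ai_news_blog()  # keys read from the environment, date defaults to today
    print(blog_markdown)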
.DS_Store ADDED
Binary file (8.2 kB).
 
LLMS/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .llm_init import initialize_llm, get_llm, get_planner
+ from .models import NewsItem, Subsection, Section, Sections
LLMS/llm_init.py ADDED
@@ -0,0 +1,48 @@
+ from langchain_groq import ChatGroq
+ import os
+ from .models import Sections
+
+ # Global LLM instance
+ llm = None
+
+ def initialize_llm(groq_api_key=None):
+     """
+     Initialize the LLM with optional API key
+
+     Args:
+         groq_api_key (str, optional): Groq API key to use
+     """
+     global llm
+
+     # Set API key if provided
+     if groq_api_key:
+         os.environ["GROQ_API_KEY"] = groq_api_key
+
+     # Initialize LLM
+     llm = ChatGroq(model="qwen-2.5-32b")
+
+     return llm
+
+ def get_llm():
+     """
+     Get the global LLM instance, initializing if needed
+
+     Returns:
+         ChatGroq: The LLM instance
+     """
+     global llm
+
+     if llm is None:
+         llm = initialize_llm()
+
+     return llm
+
+ def get_planner():
+     """
+     Get the planner with structured output capabilities
+
+     Returns:
+         Structured LLM: LLM with structured output for planning
+     """
+     # Set up structured output model for planner
+     return get_llm().with_structured_output(Sections)
LLMS/models.py ADDED
@@ -0,0 +1,26 @@
+ from typing import List
+ from pydantic import BaseModel, Field
+
+ class NewsItem(BaseModel):
+     title: str = Field(description="Title of the AI news article")
+     url: str = Field(description="URL of the news article")
+     source: str = Field(description="Source website of the news")
+     description: str = Field(description="Brief description of the news article")
+
+ class NewsResults(BaseModel):
+     news_items: List[NewsItem] = Field(description="List of AI news articles found")
+
+ class Subsection(BaseModel):
+     title: str = Field(description="Title of the subsection (based on news item title)")
+     source: str = Field(description="Source of the news item")
+     url: str = Field(description="URL of the news item")
+     content: str = Field(description="Content for this subsection")
+
+ class Section(BaseModel):
+     name: str = Field(description="Name for this section of the blog")
+     description: str = Field(description="Description for this section of the blog")
+     information: str = Field(description="Information which should be included in this section of the blog")
+     subsections: List[Subsection] = Field(description="Subsections for each news item in this category", default=[])
+
+ class Sections(BaseModel):
+     sections: List[Section] = Field(description="List of sections for this blog")
__init__ ADDED
@@ -0,0 +1,4 @@
+ # Root package initialization
+ from LLMS import initialize_llm
+ from core import generate_ai_news_blog
+ from ui import display_workflow, display_blog, launch_ui
core.py ADDED
@@ -0,0 +1,81 @@
+ from datetime import datetime
+ import os
+ from LLMS import initialize_llm
+ from graph import (
+     create_news_search_workflow,
+     create_article_scraper_workflow,
+     create_blog_generator_workflow
+ )
+
+ def generate_ai_news_blog(groq_api_key=None, tavily_api_key=None, date=None):
+     """
+     Main function to generate AI news blog
+
+     Args:
+         groq_api_key (str, optional): Groq API key
+         tavily_api_key (str, optional): Tavily API key
+         date (str, optional): Date to search for news (YYYY-MM-DD format)
+
+     Returns:
+         str: Generated blog content in markdown format
+     """
+     # Set API keys if provided
+     if groq_api_key:
+         os.environ["GROQ_API_KEY"] = groq_api_key
+     if tavily_api_key:
+         os.environ["TAVILY_API_KEY"] = tavily_api_key
+
+     # Initialize LLM with the API key
+     initialize_llm(groq_api_key)
+
+     # Get date
+     if not date:
+         today = datetime.now().strftime("%Y-%m-%d")
+     else:
+         today = date
+
+     # Step 1: Search for AI news
+     news_search = create_news_search_workflow()
+     news_results = news_search.invoke({"query": "latest artificial intelligence news", "date": today})
+
+     print(f"Found {len(news_results['news_items'])} AI news items")
+
+     # Step 2: Scrape content for each news item
+     article_scraper = create_article_scraper_workflow()
+     news_contents = []
+
+     for item in news_results["news_items"]:
+         print(f"Scraping: {item['title']} from {item['source']}")
+         result = article_scraper.invoke({"url": item['url']})
+
+         # Skip if not in English
+         if "not in English" in result["article_content"]:
+             print(f"Skipping non-English content: {item['title']}")
+             continue
+
+         news_contents.append({
+             "title": item['title'],
+             "url": item['url'],
+             "source": item['source'],
+             "description": item['description'],
+             "content": result["article_content"]
+         })
+
+     # Check if we have any news items
+     if not news_contents:
+         return "No English language AI news items found for the specified date. Please try a different date."
+
+     # Format news content for the blog generator
+     formatted_content = "\n\n".join([
+         f"TITLE: {item['title']}\nSOURCE: {item['source']}\nURL: {item['url']}\nDESCRIPTION: {item['description']}\nCONTENT: {item['content'][:2000]}..."
+         for item in news_contents
+     ])
+
+     # Step 3: Generate the blog
+     blog_generator = create_blog_generator_workflow()
+     blog_result = blog_generator.invoke({
+         "content": formatted_content,
+         "completed_sections": []
+     })
+
+     return blog_result["final_report"]
graph/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from .workflows import (
+     create_news_search_workflow,
+     create_article_scraper_workflow,
+     create_blog_generator_workflow
+ )
graph/workflows.py ADDED
@@ -0,0 +1,56 @@
+ from langgraph.graph import StateGraph, START, END
+ from state import NewsState, ArticleScraperState, BlogState
+ from nodes import (
+     search_ai_news,
+     parse_news_items,
+     scrape_article_content,
+     orchestrator,
+     llm_call,
+     synthesizer,
+     assign_workers
+ )
+
+ def create_news_search_workflow():
+     """Create a workflow for searching and parsing AI news"""
+     workflow = StateGraph(NewsState)
+
+     # Add nodes
+     workflow.add_node("search_ai_news", search_ai_news)
+     workflow.add_node("parse_news_items", parse_news_items)
+
+     # Add edges
+     workflow.add_edge(START, "search_ai_news")
+     workflow.add_edge("search_ai_news", "parse_news_items")
+     workflow.add_edge("parse_news_items", END)
+
+     return workflow.compile()
+
+ def create_article_scraper_workflow():
+     """Create a workflow for scraping article content"""
+     workflow = StateGraph(ArticleScraperState)
+
+     # Add node
+     workflow.add_node("scrape_article", scrape_article_content)
+
+     # Add edges
+     workflow.add_edge(START, "scrape_article")
+     workflow.add_edge("scrape_article", END)
+
+     return workflow.compile()
+
+ def create_blog_generator_workflow():
+     """Create a workflow for generating the blog"""
+     workflow = StateGraph(BlogState)
+
+     # Add nodes
+     workflow.add_node("orchestrator", orchestrator)
+     workflow.add_node("llm_call", llm_call)
+     workflow.add_node("synthesizer", synthesizer)
+
+     # Add edges
+     workflow.add_edge(START, "orchestrator")
+     workflow.add_conditional_edges("orchestrator", assign_workers, ["llm_call"])
+     workflow.add_edge("llm_call", "synthesizer")
+     workflow.add_edge("synthesizer", END)
+
+     return workflow.compile()
main.py ADDED
@@ -0,0 +1,70 @@
+ import argparse
+ from datetime import datetime
+ import os
+ import getpass
+ import sys
+ from graph import create_blog_generator_workflow
+ from ui import display_workflow, display_blog, launch_ui
+ from core import generate_ai_news_blog
+
+ def main():
+     """Main entry point for the application"""
+     parser = argparse.ArgumentParser(description="AI News Blog Generator")
+     parser.add_argument("--ui", action="store_true", help="Launch the web UI")
+     parser.add_argument("--date", type=str, help="Date to search for news (YYYY-MM-DD format)")
+     parser.add_argument("--groq-key", type=str, help="Groq API key")
+     parser.add_argument("--tavily-key", type=str, help="Tavily API key")
+     parser.add_argument("--output", type=str, help="Output file path")
+
+     args = parser.parse_args()
+
+     # Launch the web UI if requested
+     if args.ui:
+         print("Launching web UI...")
+         launch_ui()
+         return
+
+     # Otherwise, run the CLI version
+     try:
+         # Get API keys if not provided
+         groq_key = args.groq_key
+         if not groq_key and not os.environ.get("GROQ_API_KEY"):
+             groq_key = getpass.getpass("Enter the Groq API key: ")
+
+         tavily_key = args.tavily_key
+         if not tavily_key and not os.environ.get("TAVILY_API_KEY"):
+             tavily_key = getpass.getpass("Enter the Tavily API key: ")
+
+         # Display the workflow graph
+         print("Generating AI News Blog...")
+
+         # Generate the blog
+         blog_content = generate_ai_news_blog(groq_key, tavily_key, args.date)
+
+         # Save to file if output path provided
+         if args.output:
+             with open(args.output, "w") as f:
+                 f.write(blog_content)
+             print(f"Blog saved to {args.output}")
+         else:
+             # Otherwise, save to default file
+             output_file = f"ai_news_blog_{datetime.now().strftime('%Y-%m-%d')}.md"
+             with open(output_file, "w") as f:
+                 f.write(blog_content)
+             print(f"Blog saved to {output_file}")
+
+         # Try to display the blog if running in a notebook
+         try:
+             display_blog(blog_content)
+         except:
+             print("Blog generated successfully.")
+
+     except KeyboardInterrupt:
+         print("\nOperation cancelled by user.")
+         sys.exit(0)
+     except Exception as e:
+         print(f"Error running the pipeline: {str(e)}")
+         sys.exit(1)
+
+ if __name__ == "__main__":
+     main()
nodes/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .search_nodes import search_ai_news, parse_news_items
+ from .scraper_nodes import scrape_article_content
+ from .blog_nodes import orchestrator, llm_call, synthesizer, assign_workers
nodes/blog_nodes.py ADDED
@@ -0,0 +1,261 @@
+ from datetime import datetime
+ import re
+ from langchain_core.messages import HumanMessage, SystemMessage
+ from langgraph.constants import Send
+ from LLMS import get_llm, get_planner
+ from LLMS.models import Section, Subsection
+ from state import BlogState, WorkerState
+
+ def orchestrator(state: BlogState):
+     """Orchestrator that generates a plan for the blog based on news items"""
+
+     try:
+         # Parse the content to extract news items
+         content_lines = state['content'].split('\n\n')
+         news_items = []
+         current_item = {}
+
+         for content_block in content_lines:
+             if content_block.startswith('TITLE:'):
+                 # Start of a new item
+                 if current_item and 'title' in current_item:
+                     news_items.append(current_item)
+                     current_item = {}
+
+                 lines = content_block.split('\n')
+                 for line in lines:
+                     if line.startswith('TITLE:'):
+                         current_item['title'] = line.replace('TITLE:', '').strip()
+                     elif line.startswith('SOURCE:'):
+                         current_item['source'] = line.replace('SOURCE:', '').strip()
+                     elif line.startswith('URL:'):
+                         current_item['url'] = line.replace('URL:', '').strip()
+                     elif line.startswith('DESCRIPTION:'):
+                         current_item['description'] = line.replace('DESCRIPTION:', '').strip()
+                     elif line.startswith('CONTENT:'):
+                         current_item['content'] = line.replace('CONTENT:', '').strip()
+             elif 'content' in current_item:
+                 # Add to existing content
+                 current_item['content'] += ' ' + content_block
+
+         # Add the last item
+         if current_item and 'title' in current_item:
+             news_items.append(current_item)
+
+         # Group news items by category
+         ai_tech_items = []
+         ai_business_items = []
+         ai_research_items = []
+
+         for item in news_items:
+             title = item.get('title', '').lower()
+             description = item.get('description', '').lower()
+
+             # Simple categorization based on keywords
+             if any(kw in title + description for kw in ['business', 'market', 'company', 'investment', 'startup']):
+                 ai_business_items.append(item)
+             elif any(kw in title + description for kw in ['research', 'study', 'paper', 'university']):
+                 ai_research_items.append(item)
+             else:
+                 ai_tech_items.append(item)
+
+         # Create sections with subsections
+         sections = []
+
+         # AI Technology section
+         if ai_tech_items:
+             tech_subsections = [
+                 Subsection(
+                     title=item['title'],
+                     source=item['source'],
+                     url=item['url'],
+                     content=f"{item.get('description', '')} {item.get('content', '')[:500]}..."
+                 ) for item in ai_tech_items
+             ]
+
+             sections.append(Section(
+                 name="AI Technology Developments",
+                 description="Recent advancements in AI technology and applications",
+                 information="Cover the latest developments in AI technology.",
+                 subsections=tech_subsections
+             ))
+
+         # AI Business section
+         if ai_business_items:
+             business_subsections = [
+                 Subsection(
+                     title=item['title'],
+                     source=item['source'],
+                     url=item['url'],
+                     content=f"{item.get('description', '')} {item.get('content', '')[:500]}..."
+                 ) for item in ai_business_items
+             ]
+
+             sections.append(Section(
+                 name="AI in Business",
+                 description="How AI is transforming industries and markets",
+                 information="Focus on business applications and market trends in AI.",
+                 subsections=business_subsections
+             ))
+
+         # AI Research section
+         if ai_research_items:
+             research_subsections = [
+                 Subsection(
+                     title=item['title'],
+                     source=item['source'],
+                     url=item['url'],
+                     content=f"{item.get('description', '')} {item.get('content', '')[:500]}..."
+                 ) for item in ai_research_items
+             ]
+
+             sections.append(Section(
+                 name="AI Research and Studies",
+                 description="Latest research findings and academic work in AI",
+                 information="Cover recent research papers and studies in AI.",
+                 subsections=research_subsections
+             ))
+
+         # If no items were categorized, create a general section
+         if not sections:
+             general_subsections = [
+                 Subsection(
+                     title=item['title'],
+                     source=item['source'],
+                     url=item['url'],
+                     content=f"{item.get('description', '')} {item.get('content', '')[:500]}..."
+                 ) for item in news_items
+             ]
+
+             sections.append(Section(
+                 name="Latest AI News",
+                 description="Roundup of the latest AI news from around the web",
+                 information="Cover a range of AI news topics.",
+                 subsections=general_subsections
+             ))
+
+         return {"sections": sections}
+     except Exception as e:
+         print(f"Error in orchestrator: {str(e)}")
+         # Fallback plan if structured output fails
+         fallback_sections = [
+             Section(
+                 name="Latest AI Developments",
+                 description="Overview of recent AI advancements and research",
+                 information="Summarize the latest AI developments from the provided content.",
+                 subsections=[]
+             )
+         ]
+         return {"sections": fallback_sections}
+
+ def llm_call(state: WorkerState):
+     """Worker writes a section of the blog with subsections for each news item"""
+
+     section = state['section']
+
+     # Generate section header without ID for cleaner markdown
+     section_header = f"## {section.name}\n\n{section.description}\n"
+
+     # If there are subsections, process each one
+     subsections_content = ""
+     llm = get_llm()
+
+     if section.subsections:
+         for idx, subsection in enumerate(section.subsections):
+             # Generate subsection using LLM
+             subsection_prompt = f"""
+             Write a detailed subsection about this AI news item:
+             Title: {subsection.title}
+             Source: {subsection.source}
+             URL: {subsection.url}
+
+             Content to summarize and expand on:
+             {subsection.content}
+
+             Keep your response focused on the news item and make it engaging. Use markdown formatting.
+             """
+
+             subsection_content = llm.invoke([
+                 SystemMessage(content="You are writing a subsection for an AI news blog. Write in a professional but engaging style. Include key details and insights. Use markdown formatting."),
+                 HumanMessage(content=subsection_prompt)
+             ])
+
+             # Format subsection with title and source (without ID tags)
+             formatted_subsection = f"### {subsection.title}\n\n"
+             formatted_subsection += f"*Source: [{subsection.source}]({subsection.url})*\n\n"
+             formatted_subsection += subsection_content.content
+
+             subsections_content += formatted_subsection + "\n\n"
+     else:
+         # If no subsections, generate the full section content
+         section_content = llm.invoke([
+             SystemMessage(content="Write a blog section following the provided name, description, and information. Include no preamble. Use markdown formatting."),
+             HumanMessage(content=f"Here is the section name: {section.name}\nDescription: {section.description}\nInformation: {section.information}")
+         ])
+         subsections_content = section_content.content
+
+     # Combine section header and subsections
+     complete_section = section_header + subsections_content
+
+     # Return the completed section
+     return {"completed_sections": [complete_section]}
+
+ def synthesizer(state: BlogState):
+     """Synthesize full blog from sections with proper formatting and hierarchical TOC"""
+
+     # List of completed sections
+     completed_sections = state["completed_sections"]
+
+     # Format completed sections into a full blog post
+     completed_report = "\n\n".join(completed_sections)
+
+     # Add title, date, and introduction
+     today = datetime.now().strftime("%Y-%m-%d")
+     blog_title = f"# AI News Roundup - {today}"
+
+     # Generate a brief introduction
+     llm = get_llm()
+     intro = llm.invoke([
+         SystemMessage(content="Write a brief introduction for an AI news roundup blog post. Keep it under 100 words. Be engaging and professional."),
+         HumanMessage(content=f"Today's date is {today}. Write a brief introduction for an AI news roundup.")
+     ])
+
+     # Create hierarchical table of contents
+     table_of_contents = "## Table of Contents\n\n"
+
+     # Find all section headings (## headings)
+     section_matches = re.findall(r'## ([^\n]+)', completed_report)
+
+     for i, section_name in enumerate(section_matches, 1):
+         # Add section to TOC with auto-generated link
+         # Create a clean anchor from the section name
+         section_anchor = section_name.lower().replace(' ', '-')
+         table_of_contents += f"{i}. [{section_name}](#{section_anchor})\n"
+
+         # Find all subsections within this section
+         section_start = completed_report.find(f"## {section_name}")
+         next_section_match = re.search(r'## ', completed_report[section_start+1:])
+         if next_section_match:
+             section_end = section_start + 1 + next_section_match.start()
+             section_text = completed_report[section_start:section_end]
+         else:
+             section_text = completed_report[section_start:]
+
+         # Extract subsection headings
+         subsection_matches = re.findall(r'### ([^\n]+)', section_text)
+
+         for j, subsection_name in enumerate(subsection_matches, 1):
+             # Create a clean anchor from the subsection name
+             subsection_anchor = subsection_name.lower().replace(' ', '-').replace(':', '').replace('?', '').replace('!', '').replace('.', '')
+             # Add subsection to TOC with proper indentation
+             table_of_contents += f"   {i}.{j}. [{subsection_name}](#{subsection_anchor})\n"
+
+     final_report = f"{blog_title}\n\n{intro.content}\n\n{table_of_contents}\n\n---\n\n{completed_report}\n\n---\n\n*This AI News Roundup was automatically generated on {today}.*"
+
+     return {"final_report": final_report}
+
+ def assign_workers(state: BlogState):
+     """Assign a worker to each section in the plan"""
+
+     # Kick off section writing in parallel
+     return [Send("llm_call", {"section": s}) for s in state["sections"]]
nodes/scraper_nodes.py ADDED
@@ -0,0 +1,8 @@
+ from state import ArticleScraperState
+ from tools.scraper import scrape_article
+
+ def scrape_article_content(state: ArticleScraperState):
+     """Scrape the content from a news article URL"""
+     url = state["url"]
+     article_content = scrape_article(url)
+     return {"article_content": article_content}
nodes/search_nodes.py ADDED
@@ -0,0 +1,103 @@
+ from datetime import datetime
+ import re
+ import json
+ from langchain_core.messages import HumanMessage, SystemMessage
+ from LLMS import get_llm
+ from tools.search import get_search_tool
+ from tools.language_detector import is_english
+ from state import NewsState
+
+ def search_ai_news(state: NewsState):
+     """Search for the latest AI news using Tavily"""
+     search_tool = get_search_tool()
+
+     # Format today's date
+     today = state.get("date", datetime.now().strftime("%Y-%m-%d"))
+
+     # Create search query with date to get recent news
+     query = f"latest artificial intelligence news {today} english"
+
+     # Execute search
+     search_results = search_tool.invoke({"query": query})
+
+     # Filter out YouTube results and non-English content
+     filtered_results = []
+     for result in search_results:
+         if "youtube.com" not in result.get("url", "").lower():
+             # Check if content is in English
+             content = result.get("content", "") + " " + result.get("title", "")
+             if is_english(content):
+                 filtered_results.append(result)
+
+     return {"search_results": filtered_results}
+
+ def parse_news_items(state: NewsState):
+     """Parse search results into structured news items using a more robust approach"""
+     search_results = state["search_results"]
+
+     # Format results for the LLM
+     formatted_results = "\n\n".join([
+         f"Title: {result.get('title', 'No title')}\n"
+         f"URL: {result.get('url', 'No URL')}\n"
+         f"Content: {result.get('content', 'No content')}"
+         for result in search_results
+     ])
+
+     # Use a direct prompt instead of structured output
+     system_prompt = """
+     Extract AI news articles from these search results. Filter out any that aren't about artificial intelligence.
+
+     For each relevant AI news article, provide:
+     - title: The title of the article
+     - url: The URL of the article
+     - source: The source website of the news
+     - description: A brief description of the article
+
+     Format your response as a JSON list of objects. Only include the relevant fields, nothing else.
+     Example format:
+     [
+       {
+         "title": "New AI Development",
+         "url": "https://example.com/news/ai-dev",
+         "source": "Example News",
+         "description": "Description of the AI development"
+       }
+     ]
+     """
+
+     # Get the response as a string
+     llm = get_llm()
+     response = llm.invoke([
+         SystemMessage(content=system_prompt),
+         HumanMessage(content=f"Here are the search results:\n\n{formatted_results}")
+     ])
+
+     # Extract the JSON part from the response
+     response_text = response.content
+
+     # Find JSON list in the response
+     json_match = re.search(r'\[\s*\{.*\}\s*\]', response_text, re.DOTALL)
+
+     news_items = []
+     if json_match:
+         try:
+             # Parse the JSON text
+             news_items = json.loads(json_match.group(0))
+         except json.JSONDecodeError:
+             # Fallback: create a simple item if JSON parsing fails
+             news_items = [{
+                 "title": "AI News Roundup",
+                 "url": "https://example.com/ai-news",
+                 "source": "Various Sources",
+                 "description": "Compilation of latest AI news from various sources."
+             }]
+     else:
+         # Create a default item if no JSON found
+         news_items = [{
+             "title": "AI News Roundup",
+             "url": "https://example.com/ai-news",
+             "source": "Various Sources",
+             "description": "Compilation of latest AI news from various sources."
+         }]
+
+     return {"news_items": news_items}
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ langchain-groq
+ langgraph
+ langchain-core
+ langchain-community
+ pydantic
+ requests
+ beautifulsoup4
+ gradio
+ langdetect
+ IPython
+ typing-extensions
state/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .news_state import NewsState, ArticleScraperState
+ from .blog_state import BlogState, WorkerState
state/blog_state.py ADDED
@@ -0,0 +1,14 @@
+ from typing_extensions import TypedDict
+ from typing import List, Annotated
+ import operator
+ from LLMS.models import Section
+
+ class BlogState(TypedDict):
+     content: str
+     sections: List[Section]
+     completed_sections: Annotated[List, operator.add]
+     final_report: str
+
+ class WorkerState(TypedDict):
+     section: Section
+     completed_sections: Annotated[List, operator.add]
state/news_state.py ADDED
@@ -0,0 +1,12 @@
+ from typing_extensions import TypedDict
+ from typing import List, Dict, Any
+
+ class NewsState(TypedDict):
+     query: str
+     date: str
+     search_results: List[Dict[str, Any]]
+     news_items: List[Dict[str, Any]]
+
+ class ArticleScraperState(TypedDict):
+     url: str
+     article_content: str
tools/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .search import get_search_tool
+ from .scraper import scrape_article
+ from .language_detector import is_english
tools/language_detector.py ADDED
@@ -0,0 +1,33 @@
+ from langdetect import detect
+
+ def is_english(text):
+     """
+     Detects if text is in English
+
+     Args:
+         text (str): The text to analyze
+
+     Returns:
+         bool: True if text is in English, False otherwise
+     """
+     # Ensure we have enough text to analyze
+     if not text or len(text.strip()) < 50:
+         return False
+
+     try:
+         # Try primary language detection
+         return detect(text) == 'en'
+     except:
+         # If detection fails, use a more robust approach
+         common_english_words = ['the', 'and', 'in', 'to', 'of', 'is', 'for', 'with', 'on', 'that',
+                                 'this', 'are', 'was', 'be', 'have', 'it', 'not', 'they', 'by', 'from']
+         text_lower = text.lower()
+         # Count occurrences of common English words
+         english_word_count = sum(1 for word in common_english_words if f" {word} " in f" {text_lower} ")
+         # Calculate ratio of English words to text length
+         text_words = len(text_lower.split())
+         if text_words == 0:  # Avoid division by zero
+             return False
+
+         english_ratio = english_word_count / min(20, text_words)  # Cap at 20 to avoid skew
+         return english_word_count >= 5 or english_ratio > 0.25  # More stringent criteria
tools/scraper.py ADDED
@@ -0,0 +1,53 @@
+ import requests
+ from bs4 import BeautifulSoup
+ import re
+ from .language_detector import is_english
+
+ def scrape_article(url):
+     """
+     Scrape the content from a news article URL
+
+     Args:
+         url (str): The URL to scrape
+
+     Returns:
+         str: The extracted article content or error message
+     """
+     try:
+         headers = {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+         }
+         response = requests.get(url, headers=headers, timeout=10)
+         response.raise_for_status()
+
+         soup = BeautifulSoup(response.text, 'html.parser')
+
+         # Extract article content - this is a simple implementation
+         article_text = ""
+
+         # Try to find the main article content
+         article = soup.find('article')
+         if article:
+             paragraphs = article.find_all('p')
+         else:
+             # Fallback to all paragraphs
+             paragraphs = soup.find_all('p')
+
+         # Extract text from paragraphs
+         article_text = "\n\n".join([p.get_text().strip() for p in paragraphs])
+
+         # Clean up the text
+         article_text = re.sub(r'\s+', ' ', article_text).strip()
+
+         # Trim to reasonable length for LLM processing
+         if len(article_text) > 10000:
+             article_text = article_text[:10000] + "..."
+
+         # Verify the content is in English
+         if not is_english(article_text[:500]):  # Check first 500 chars to save processing time
+             return "Content not in English or insufficient text to analyze."
+
+         return article_text
+
+     except Exception as e:
+         return f"Error scraping article: {str(e)}"
tools/search.py ADDED
@@ -0,0 +1,13 @@
+ from langchain_community.tools.tavily_search import TavilySearchResults
+
+ def get_search_tool(max_results=10):
+     """
+     Create and return a TavilySearchResults tool
+
+     Args:
+         max_results (int): Maximum number of search results to return
+
+     Returns:
+         TavilySearchResults: Configured search tool
+     """
+     return TavilySearchResults(max_results=max_results)
ui/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .display import display_workflow, display_blog
+ from .web_ui import create_gradio_interface, launch_ui
ui/display.py ADDED
@@ -0,0 +1,19 @@
+ from IPython.display import Image, display, Markdown
+
+ def display_workflow(workflow):
+     """
+     Display a workflow graph using IPython
+
+     Args:
+         workflow: The workflow to display
+     """
+     display(Image(workflow.get_graph().draw_mermaid_png()))
+
+ def display_blog(blog_content):
+     """
+     Display a blog as markdown using IPython
+
+     Args:
+         blog_content (str): Markdown content to display
+     """
+     display(Markdown(blog_content))
ui/web_ui.py ADDED
@@ -0,0 +1,69 @@
+ import gradio as gr
+ from datetime import datetime
+ from core import generate_ai_news_blog
+
+ def create_gradio_interface():
+     """
+     Create a Gradio interface for the AI News Blog Generator
+
+     Returns:
+         gr.Blocks: Gradio interface
+     """
+     def run_generation(groq_key, tavily_key, selected_date):
+         if not groq_key or not tavily_key:
+             return "Please provide both API keys."
+
+         try:
+             result = generate_ai_news_blog(groq_key, tavily_key, selected_date)
+             return result
+         except Exception as e:
+             return f"Error generating blog: {str(e)}"
+
+     # Create the interface
+     with gr.Blocks(title="AI News Blog Generator") as demo:
+         gr.Markdown("# AI News Blog Generator")
+         gr.Markdown("Generate a daily roundup of AI news articles, categorized by topic.")
+
+         with gr.Row():
+             with gr.Column():
+                 groq_key = gr.Textbox(label="Groq API Key", placeholder="Enter your Groq API key", type="password")
+                 tavily_key = gr.Textbox(label="Tavily API Key", placeholder="Enter your Tavily API key", type="password")
+                 date_picker = gr.Textbox(label="Date (YYYY-MM-DD)", placeholder="Leave empty for today's date",
+                                          value=datetime.now().strftime("%Y-%m-%d"))
+                 with gr.Row():
+                     generate_button = gr.Button("Generate AI News Blog", variant="primary")
+                     clear_button = gr.Button("Clear Output")
+
+             with gr.Column():
+                 status_text = gr.Textbox(label="Status", placeholder="Ready to generate", interactive=False)
+                 output_md = gr.Markdown("Your AI News Blog will appear here.")
+
+         # Add loading state and status updates
+         generate_button.click(
+             fn=lambda: "Generating AI News Blog... This may take several minutes.",
+             inputs=None,
+             outputs=status_text,
+             queue=False
+         ).then(
+             fn=run_generation,
+             inputs=[groq_key, tavily_key, date_picker],
+             outputs=output_md
+         ).then(
+             fn=lambda: "Blog generation complete!",
+             inputs=None,
+             outputs=status_text
+         )
+
+         # Clear output
+         clear_button.click(
+             fn=lambda: ("Ready to generate", ""),
+             inputs=None,
+             outputs=[status_text, output_md]
+         )
+
+     return demo
+
+ def launch_ui():
+     """Launch the Gradio web UI"""
+     demo = create_gradio_interface()
+     demo.launch()
vectorstore/__init__.py ADDED
File without changes
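As a quick smoke test of the new ui package, the Gradio interface can be launched directly, assuming the packages from requirements.txt are installed (a hypothetical invocation, equivalent to running main.py with --ui):

    # Start the Gradio web UI defined in ui/web_ui.py
    from ui import launch_ui

    launch_ui()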