Removed the hashtags
app.py
CHANGED
@@ -66,14 +66,27 @@ class ArticleScraperState(TypedDict):
 
 # Helper function to detect English language
 def is_english(text):
+    # Ensure we have enough text to analyze
+    if not text or len(text.strip()) < 50:
+        return False
+
     try:
+        # Try primary language detection
         return detect(text) == 'en'
     except:
-        # If detection fails,
-        common_english_words = ['the', 'and', 'in', 'to', 'of', 'is', 'for', 'with', 'on', 'that'
+        # If detection fails, use a more robust approach
+        common_english_words = ['the', 'and', 'in', 'to', 'of', 'is', 'for', 'with', 'on', 'that',
+                                'this', 'are', 'was', 'be', 'have', 'it', 'not', 'they', 'by', 'from']
         text_lower = text.lower()
+        # Count occurrences of common English words
         english_word_count = sum(1 for word in common_english_words if f" {word} " in f" {text_lower} ")
-
+        # Calculate ratio of English words to text length
+        text_words = len(text_lower.split())
+        if text_words == 0:  # Avoid division by zero
+            return False
+
+        english_ratio = english_word_count / min(20, text_words)  # Cap at 20 to avoid skew
+        return english_word_count >= 5 or english_ratio > 0.25  # More stringent criteria
 
 # News search functions
 def search_ai_news(state: NewsState):
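For reference, a minimal standalone sketch of the fallback heuristic added above. It leaves out the primary `detect()` call (presumably `langdetect`'s `detect()`, imported elsewhere in app.py), and the function name and sample strings are illustrative only, not taken from the app:

```python
# Standalone sketch of the except-branch heuristic; thresholds mirror the new code.
def looks_english(text: str) -> bool:
    common_english_words = ['the', 'and', 'in', 'to', 'of', 'is', 'for', 'with', 'on', 'that',
                            'this', 'are', 'was', 'be', 'have', 'it', 'not', 'they', 'by', 'from']
    text_lower = text.lower()
    # Space-delimited whole-word matches (words right next to punctuation are not counted)
    english_word_count = sum(1 for word in common_english_words if f" {word} " in f" {text_lower} ")
    text_words = len(text_lower.split())
    if text_words == 0:  # Avoid division by zero
        return False
    english_ratio = english_word_count / min(20, text_words)  # Cap at 20 to avoid skew
    return english_word_count >= 5 or english_ratio > 0.25

print(looks_english("The model is trained on data that they release to the public for free"))  # True
print(looks_english("Le modèle est entraîné sur des données publiées librement"))              # False
```

Either condition is enough on its own, matching the `or` in the new return statement; very short texts never reach this branch because of the new 50-character guard at the top of `is_english`.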
@@ -362,9 +375,8 @@ def llm_call(state: WorkerState):
 
     section = state['section']
 
-    # Generate section header
-
-    section_header = f"## {section.name} {{#{section_id}}}\n\n{section.description}\n"
+    # Generate section header without ID for cleaner markdown
+    section_header = f"## {section.name}\n\n{section.description}\n"
 
     # If there are subsections, process each one
     subsections_content = ""
@@ -388,11 +400,8 @@ Keep your response focused on the news item and make it engaging. Use markdown f
             HumanMessage(content=subsection_prompt)
         ])
 
-        #
-
-
-        # Format subsection with title and source
-        formatted_subsection = f"### {subsection.title} {{#{subsection_id}}}\n\n"
+        # Format subsection with title and source (without ID tags)
+        formatted_subsection = f"### {subsection.title}\n\n"
         formatted_subsection += f"*Source: [{subsection.source}]({subsection.url})*\n\n"
         formatted_subsection += subsection_content.content
 
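For context on what these two hunks change in the rendered output: the old heading f-strings appended a Pandoc-style `{#id}` attribute, which most CommonMark renderers display as literal text in the heading — presumably the "hashtags" the commit title refers to. A small illustration with hypothetical values (the `Section` stand-in, `section_id`, and the strings are not from app.py):

```python
class Section:
    # Hypothetical stand-in for the app's section object
    name = "Model Releases"
    description = "New models announced this week."

section = Section()
section_id = "model-releases"  # hypothetical id; how the old code derived it is not shown in this diff

old_header = f"## {section.name} {{#{section_id}}}\n\n{section.description}\n"
new_header = f"## {section.name}\n\n{section.description}\n"

print(old_header)  # "## Model Releases {#model-releases}" followed by the description
print(new_header)  # "## Model Releases" followed by the description
```

The subsection heading change above follows the same pattern with `###`.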
@@ -434,14 +443,15 @@ def synthesizer(state: BlogState):
     table_of_contents = "## Table of Contents\n\n"
 
     # Find all section headings (## headings)
-    section_matches = re.findall(r'## (
+    section_matches = re.findall(r'## ([^\n]+)', completed_report)
 
-    for i,
-        # Add section to TOC
-
+    for i, section_name in enumerate(section_matches, 1):
+        # Add section to TOC with auto-generated link
+        # Create a clean anchor from the section name
+        section_anchor = section_name.lower().replace(' ', '-')
+        table_of_contents += f"{i}. [{section_name}](#{section_anchor})\n"
 
         # Find all subsections within this section
-        # Look for subsection headings (### headings) until the next section or end of text
         section_start = completed_report.find(f"## {section_name}")
         next_section_match = re.search(r'## ', completed_report[section_start+1:])
         if next_section_match:
@@ -450,12 +460,14 @@ def synthesizer(state: BlogState):
         else:
             section_text = completed_report[section_start:]
 
-        # Extract subsection headings
-        subsection_matches = re.findall(r'### (
+        # Extract subsection headings
+        subsection_matches = re.findall(r'### ([^\n]+)', section_text)
 
-        for j,
+        for j, subsection_name in enumerate(subsection_matches, 1):
+            # Create a clean anchor from the subsection name
+            subsection_anchor = subsection_name.lower().replace(' ', '-').replace(':', '').replace('?', '').replace('!', '').replace('.', '')
             # Add subsection to TOC with proper indentation
-            table_of_contents += f" {i}.{j}. [{subsection_name}](#{
+            table_of_contents += f" {i}.{j}. [{subsection_name}](#{subsection_anchor})\n"
 
     final_report = f"{blog_title}\n\n{intro.content}\n\n{table_of_contents}\n\n---\n\n{completed_report}\n\n---\n\n*This AI News Roundup was automatically generated on {today}.*"
 
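The rewritten TOC builds its link targets directly from the heading text. A quick sketch of the two anchor transformations above, on hypothetical headings:

```python
# Hypothetical headings, to show the anchors the new TOC code would emit.
section_name = "Research & Papers"
section_anchor = section_name.lower().replace(' ', '-')
print(section_anchor)  # research-&-papers

subsection_name = "OpenAI: What's Next?"
subsection_anchor = (subsection_name.lower()
                     .replace(' ', '-').replace(':', '')
                     .replace('?', '').replace('!', '').replace('.', ''))
print(subsection_anchor)  # openai-what's-next
```

Markdown viewers differ in how they slug headings (GitHub-style slugs also drop characters such as `&` and the apostrophe), so anchors built this way may not resolve in every renderer.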
@@ -559,6 +571,10 @@ def generate_ai_news_blog(groq_api_key=None, tavily_api_key=None, date=None):
                 "content": result["article_content"]
             })
 
+    # Check if we have any news items
+    if not news_contents:
+        return "No English language AI news items found for the specified date. Please try a different date."
+
     # Format news content for the blog generator
     formatted_content = "\n\n".join([
         f"TITLE: {item['title']}\nSOURCE: {item['source']}\nURL: {item['url']}\nDESCRIPTION: {item['description']}\nCONTENT: {item['content'][:2000]}..."
@@ -599,15 +615,35 @@ def create_gradio_interface():
                 tavily_key = gr.Textbox(label="Tavily API Key", placeholder="Enter your Tavily API key", type="password")
                 date_picker = gr.Textbox(label="Date (YYYY-MM-DD)", placeholder="Leave empty for today's date",
                                          value=datetime.now().strftime("%Y-%m-%d"))
-
+                with gr.Row():
+                    generate_button = gr.Button("Generate AI News Blog", variant="primary")
+                    clear_button = gr.Button("Clear Output")
 
             with gr.Column():
+                status_text = gr.Textbox(label="Status", placeholder="Ready to generate", interactive=False)
                 output_md = gr.Markdown("Your AI News Blog will appear here.")
 
+        # Add loading state and status updates
         generate_button.click(
+            fn=lambda: "Generating AI News Blog... This may take several minutes.",
+            inputs=None,
+            outputs=status_text,
+            queue=False
+        ).then(
             fn=run_generation,
             inputs=[groq_key, tavily_key, date_picker],
             outputs=output_md
+        ).then(
+            fn=lambda: "Blog generation complete!",
+            inputs=None,
+            outputs=status_text
+        )
+
+        # Clear output
+        clear_button.click(
+            fn=lambda: ("Ready to generate", ""),
+            inputs=None,
+            outputs=[status_text, output_md]
         )
 
     return demo
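The chained `.click(...).then(...)` calls above use Gradio's event chaining to show a status message before the long-running generation and update it again when the run finishes. A minimal self-contained sketch of the same pattern — the component names and the dummy task are placeholders, and it assumes a Gradio version that supports `.then()`, which the diff itself already relies on:

```python
import time
import gradio as gr

def slow_task(name):
    time.sleep(2)  # stand-in for the long-running blog generation
    return f"Hello, {name}!"

with gr.Blocks() as demo:
    name_box = gr.Textbox(label="Name")
    status = gr.Textbox(label="Status", interactive=False)
    output = gr.Markdown()
    run_btn = gr.Button("Run")

    # Update the status immediately, run the slow task, then update the status again.
    run_btn.click(
        fn=lambda: "Working...", inputs=None, outputs=status, queue=False
    ).then(
        fn=slow_task, inputs=name_box, outputs=output
    ).then(
        fn=lambda: "Done!", inputs=None, outputs=status
    )

if __name__ == "__main__":
    demo.launch()
```

Passing `queue=False` on the first step is what lets the status text update right away instead of waiting in the queue behind the slow job.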