Phoenix21 committed on
Commit
065f30a
·
verified ·
1 Parent(s): 60566a2

Removed the hashtags

Browse files
Files changed (1) hide show
  1. app.py +57 -21
app.py CHANGED
@@ -66,14 +66,27 @@ class ArticleScraperState(TypedDict):
66
 
67
  # Helper function to detect English language
68
  def is_english(text):
 
 
 
 
69
  try:
 
70
  return detect(text) == 'en'
71
  except:
72
- # If detection fails, check for common English words
73
- common_english_words = ['the', 'and', 'in', 'to', 'of', 'is', 'for', 'with', 'on', 'that']
 
74
  text_lower = text.lower()
 
75
  english_word_count = sum(1 for word in common_english_words if f" {word} " in f" {text_lower} ")
76
- return english_word_count >= 3 # If at least 3 common English words are found
 
 
 
 
 
 
77
 
78
  # News search functions
79
  def search_ai_news(state: NewsState):
@@ -362,9 +375,8 @@ def llm_call(state: WorkerState):
362
 
363
  section = state['section']
364
 
365
- # Generate section header with ID for anchor linking
366
- section_id = section.name.lower().replace(' ', '-')
367
- section_header = f"## {section.name} {{#{section_id}}}\n\n{section.description}\n"
368
 
369
  # If there are subsections, process each one
370
  subsections_content = ""
@@ -388,11 +400,8 @@ Keep your response focused on the news item and make it engaging. Use markdown f
388
  HumanMessage(content=subsection_prompt)
389
  ])
390
 
391
- # Create a clean ID for the subsection
392
- subsection_id = f"{section_id}-{idx+1}-{subsection.title.lower().replace(' ', '-').replace(':', '').replace('?', '').replace('!', '')}"
393
-
394
- # Format subsection with title and source
395
- formatted_subsection = f"### {subsection.title} {{#{subsection_id}}}\n\n"
396
  formatted_subsection += f"*Source: [{subsection.source}]({subsection.url})*\n\n"
397
  formatted_subsection += subsection_content.content
398
 
@@ -434,14 +443,15 @@ def synthesizer(state: BlogState):
434
  table_of_contents = "## Table of Contents\n\n"
435
 
436
  # Find all section headings (## headings)
437
- section_matches = re.findall(r'## (.*?) {#(.*?)}', completed_report)
438
 
439
- for i, (section_name, section_id) in enumerate(section_matches, 1):
440
- # Add section to TOC
441
- table_of_contents += f"{i}. [{section_name}](#{section_id})\n"
 
 
442
 
443
  # Find all subsections within this section
444
- # Look for subsection headings (### headings) until the next section or end of text
445
  section_start = completed_report.find(f"## {section_name}")
446
  next_section_match = re.search(r'## ', completed_report[section_start+1:])
447
  if next_section_match:
@@ -450,12 +460,14 @@ def synthesizer(state: BlogState):
450
  else:
451
  section_text = completed_report[section_start:]
452
 
453
- # Extract subsection headings and IDs
454
- subsection_matches = re.findall(r'### (.*?) {#(.*?)}', section_text)
455
 
456
- for j, (subsection_name, subsection_id) in enumerate(subsection_matches, 1):
 
 
457
  # Add subsection to TOC with proper indentation
458
- table_of_contents += f" {i}.{j}. [{subsection_name}](#{subsection_id})\n"
459
 
460
  final_report = f"{blog_title}\n\n{intro.content}\n\n{table_of_contents}\n\n---\n\n{completed_report}\n\n---\n\n*This AI News Roundup was automatically generated on {today}.*"
461
 
@@ -559,6 +571,10 @@ def generate_ai_news_blog(groq_api_key=None, tavily_api_key=None, date=None):
559
  "content": result["article_content"]
560
  })
561
 
 
 
 
 
562
  # Format news content for the blog generator
563
  formatted_content = "\n\n".join([
564
  f"TITLE: {item['title']}\nSOURCE: {item['source']}\nURL: {item['url']}\nDESCRIPTION: {item['description']}\nCONTENT: {item['content'][:2000]}..."
@@ -599,15 +615,35 @@ def create_gradio_interface():
599
  tavily_key = gr.Textbox(label="Tavily API Key", placeholder="Enter your Tavily API key", type="password")
600
  date_picker = gr.Textbox(label="Date (YYYY-MM-DD)", placeholder="Leave empty for today's date",
601
  value=datetime.now().strftime("%Y-%m-%d"))
602
- generate_button = gr.Button("Generate AI News Blog")
 
 
603
 
604
  with gr.Column():
 
605
  output_md = gr.Markdown("Your AI News Blog will appear here.")
606
 
 
607
  generate_button.click(
 
 
 
 
 
608
  fn=run_generation,
609
  inputs=[groq_key, tavily_key, date_picker],
610
  outputs=output_md
 
 
 
 
 
 
 
 
 
 
 
611
  )
612
 
613
  return demo
 
66
 
67
  # Helper function to detect English language
68
  def is_english(text):
69
+ # Ensure we have enough text to analyze
70
+ if not text or len(text.strip()) < 50:
71
+ return False
72
+
73
  try:
74
+ # Try primary language detection
75
  return detect(text) == 'en'
76
  except:
77
+ # If detection fails, use a more robust approach
78
+ common_english_words = ['the', 'and', 'in', 'to', 'of', 'is', 'for', 'with', 'on', 'that',
79
+ 'this', 'are', 'was', 'be', 'have', 'it', 'not', 'they', 'by', 'from']
80
  text_lower = text.lower()
81
+ # Count occurrences of common English words
82
  english_word_count = sum(1 for word in common_english_words if f" {word} " in f" {text_lower} ")
83
+ # Calculate ratio of English words to text length
84
+ text_words = len(text_lower.split())
85
+ if text_words == 0: # Avoid division by zero
86
+ return False
87
+
88
+ english_ratio = english_word_count / min(20, text_words) # Cap at 20 to avoid skew
89
+ return english_word_count >= 5 or english_ratio > 0.25 # More stringent criteria
90
 
91
  # News search functions
92
  def search_ai_news(state: NewsState):
 
375
 
376
  section = state['section']
377
 
378
+ # Generate section header without ID for cleaner markdown
379
+ section_header = f"## {section.name}\n\n{section.description}\n"
 
380
 
381
  # If there are subsections, process each one
382
  subsections_content = ""
 
400
  HumanMessage(content=subsection_prompt)
401
  ])
402
 
403
+ # Format subsection with title and source (without ID tags)
404
+ formatted_subsection = f"### {subsection.title}\n\n"
 
 
 
405
  formatted_subsection += f"*Source: [{subsection.source}]({subsection.url})*\n\n"
406
  formatted_subsection += subsection_content.content
407
 
 
443
  table_of_contents = "## Table of Contents\n\n"
444
 
445
  # Find all section headings (## headings)
446
+ section_matches = re.findall(r'## ([^\n]+)', completed_report)
447
 
448
+ for i, section_name in enumerate(section_matches, 1):
449
+ # Add section to TOC with auto-generated link
450
+ # Create a clean anchor from the section name
451
+ section_anchor = section_name.lower().replace(' ', '-')
452
+ table_of_contents += f"{i}. [{section_name}](#{section_anchor})\n"
453
 
454
  # Find all subsections within this section
 
455
  section_start = completed_report.find(f"## {section_name}")
456
  next_section_match = re.search(r'## ', completed_report[section_start+1:])
457
  if next_section_match:
 
460
  else:
461
  section_text = completed_report[section_start:]
462
 
463
+ # Extract subsection headings
464
+ subsection_matches = re.findall(r'### ([^\n]+)', section_text)
465
 
466
+ for j, subsection_name in enumerate(subsection_matches, 1):
467
+ # Create a clean anchor from the subsection name
468
+ subsection_anchor = subsection_name.lower().replace(' ', '-').replace(':', '').replace('?', '').replace('!', '').replace('.', '')
469
  # Add subsection to TOC with proper indentation
470
+ table_of_contents += f" {i}.{j}. [{subsection_name}](#{subsection_anchor})\n"
471
 
472
  final_report = f"{blog_title}\n\n{intro.content}\n\n{table_of_contents}\n\n---\n\n{completed_report}\n\n---\n\n*This AI News Roundup was automatically generated on {today}.*"
473
 
 
571
  "content": result["article_content"]
572
  })
573
 
574
+ # Check if we have any news items
575
+ if not news_contents:
576
+ return "No English language AI news items found for the specified date. Please try a different date."
577
+
578
  # Format news content for the blog generator
579
  formatted_content = "\n\n".join([
580
  f"TITLE: {item['title']}\nSOURCE: {item['source']}\nURL: {item['url']}\nDESCRIPTION: {item['description']}\nCONTENT: {item['content'][:2000]}..."
 
615
  tavily_key = gr.Textbox(label="Tavily API Key", placeholder="Enter your Tavily API key", type="password")
616
  date_picker = gr.Textbox(label="Date (YYYY-MM-DD)", placeholder="Leave empty for today's date",
617
  value=datetime.now().strftime("%Y-%m-%d"))
618
+ with gr.Row():
619
+ generate_button = gr.Button("Generate AI News Blog", variant="primary")
620
+ clear_button = gr.Button("Clear Output")
621
 
622
  with gr.Column():
623
+ status_text = gr.Textbox(label="Status", placeholder="Ready to generate", interactive=False)
624
  output_md = gr.Markdown("Your AI News Blog will appear here.")
625
 
626
+ # Add loading state and status updates
627
  generate_button.click(
628
+ fn=lambda: "Generating AI News Blog... This may take several minutes.",
629
+ inputs=None,
630
+ outputs=status_text,
631
+ queue=False
632
+ ).then(
633
  fn=run_generation,
634
  inputs=[groq_key, tavily_key, date_picker],
635
  outputs=output_md
636
+ ).then(
637
+ fn=lambda: "Blog generation complete!",
638
+ inputs=None,
639
+ outputs=status_text
640
+ )
641
+
642
+ # Clear output
643
+ clear_button.click(
644
+ fn=lambda: ("Ready to generate", ""),
645
+ inputs=None,
646
+ outputs=[status_text, output_md]
647
  )
648
 
649
  return demo