Removed the hashtags
app.py
CHANGED
@@ -66,14 +66,27 @@ class ArticleScraperState(TypedDict):
 
 # Helper function to detect English language
 def is_english(text):
+    # Ensure we have enough text to analyze
+    if not text or len(text.strip()) < 50:
+        return False
+
     try:
+        # Try primary language detection
         return detect(text) == 'en'
     except:
-        # If detection fails,
-        common_english_words = ['the', 'and', 'in', 'to', 'of', 'is', 'for', 'with', 'on', 'that'
+        # If detection fails, use a more robust approach
+        common_english_words = ['the', 'and', 'in', 'to', 'of', 'is', 'for', 'with', 'on', 'that',
+                                'this', 'are', 'was', 'be', 'have', 'it', 'not', 'they', 'by', 'from']
         text_lower = text.lower()
+        # Count occurrences of common English words
         english_word_count = sum(1 for word in common_english_words if f" {word} " in f" {text_lower} ")
-
+        # Calculate ratio of English words to text length
+        text_words = len(text_lower.split())
+        if text_words == 0:  # Avoid division by zero
+            return False
+
+        english_ratio = english_word_count / min(20, text_words)  # Cap at 20 to avoid skew
+        return english_word_count >= 5 or english_ratio > 0.25  # More stringent criteria
 
 # News search functions
 def search_ai_news(state: NewsState):
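For reference, a minimal standalone sketch of the fallback heuristic added above. It leaves out the primary `detect()` call (presumably `langdetect`'s `detect()`, imported elsewhere in app.py), and the function name and sample strings are illustrative only, not taken from the app:

```python
# Standalone sketch of the except-branch heuristic; thresholds mirror the new code.
def looks_english(text: str) -> bool:
    common_english_words = ['the', 'and', 'in', 'to', 'of', 'is', 'for', 'with', 'on', 'that',
                            'this', 'are', 'was', 'be', 'have', 'it', 'not', 'they', 'by', 'from']
    text_lower = text.lower()
    # Space-delimited whole-word matches (words right next to punctuation are not counted)
    english_word_count = sum(1 for word in common_english_words if f" {word} " in f" {text_lower} ")
    text_words = len(text_lower.split())
    if text_words == 0:  # Avoid division by zero
        return False
    english_ratio = english_word_count / min(20, text_words)  # Cap at 20 to avoid skew
    return english_word_count >= 5 or english_ratio > 0.25

print(looks_english("The model is trained on data that they release to the public for free"))  # True
print(looks_english("Le modèle est entraîné sur des données publiées librement"))              # False
```

Either condition is enough on its own, matching the `or` in the new return statement; very short texts never reach this branch because of the new 50-character guard at the top of `is_english`.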
@@ -362,9 +375,8 @@ def llm_call(state: WorkerState):
 
     section = state['section']
 
-    # Generate section header
-
-    section_header = f"## {section.name} {{#{section_id}}}\n\n{section.description}\n"
+    # Generate section header without ID for cleaner markdown
+    section_header = f"## {section.name}\n\n{section.description}\n"
 
     # If there are subsections, process each one
     subsections_content = ""
@@ -388,11 +400,8 @@ Keep your response focused on the news item and make it engaging. Use markdown f
             HumanMessage(content=subsection_prompt)
         ])
 
-        #
-
-
-        # Format subsection with title and source
-        formatted_subsection = f"### {subsection.title} {{#{subsection_id}}}\n\n"
+        # Format subsection with title and source (without ID tags)
+        formatted_subsection = f"### {subsection.title}\n\n"
         formatted_subsection += f"*Source: [{subsection.source}]({subsection.url})*\n\n"
         formatted_subsection += subsection_content.content
 
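For context on what these two hunks change in the rendered output: the old heading f-strings appended a Pandoc-style `{#id}` attribute, which most CommonMark renderers display as literal text in the heading — presumably the "hashtags" the commit title refers to. A small illustration with hypothetical values (the `Section` stand-in, `section_id`, and the strings are not from app.py):

```python
class Section:
    # Hypothetical stand-in for the app's section object
    name = "Model Releases"
    description = "New models announced this week."

section = Section()
section_id = "model-releases"  # hypothetical id; how the old code derived it is not shown in this diff

old_header = f"## {section.name} {{#{section_id}}}\n\n{section.description}\n"
new_header = f"## {section.name}\n\n{section.description}\n"

print(old_header)  # "## Model Releases {#model-releases}" followed by the description
print(new_header)  # "## Model Releases" followed by the description
```

The subsection heading change above follows the same pattern with `###`.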
@@ -434,14 +443,15 @@ def synthesizer(state: BlogState):
     table_of_contents = "## Table of Contents\n\n"
 
     # Find all section headings (## headings)
-    section_matches = re.findall(r'## (
+    section_matches = re.findall(r'## ([^\n]+)', completed_report)
 
-    for i,
-        # Add section to TOC
-
+    for i, section_name in enumerate(section_matches, 1):
+        # Add section to TOC with auto-generated link
+        # Create a clean anchor from the section name
+        section_anchor = section_name.lower().replace(' ', '-')
+        table_of_contents += f"{i}. [{section_name}](#{section_anchor})\n"
 
         # Find all subsections within this section
-        # Look for subsection headings (### headings) until the next section or end of text
         section_start = completed_report.find(f"## {section_name}")
         next_section_match = re.search(r'## ', completed_report[section_start+1:])
         if next_section_match:
@@ -450,12 +460,14 @@ def synthesizer(state: BlogState):
         else:
             section_text = completed_report[section_start:]
 
-        # Extract subsection headings
-        subsection_matches = re.findall(r'### (
+        # Extract subsection headings
+        subsection_matches = re.findall(r'### ([^\n]+)', section_text)
 
-        for j,
+        for j, subsection_name in enumerate(subsection_matches, 1):
+            # Create a clean anchor from the subsection name
+            subsection_anchor = subsection_name.lower().replace(' ', '-').replace(':', '').replace('?', '').replace('!', '').replace('.', '')
             # Add subsection to TOC with proper indentation
-            table_of_contents += f" {i}.{j}. [{subsection_name}](#{
+            table_of_contents += f" {i}.{j}. [{subsection_name}](#{subsection_anchor})\n"
 
     final_report = f"{blog_title}\n\n{intro.content}\n\n{table_of_contents}\n\n---\n\n{completed_report}\n\n---\n\n*This AI News Roundup was automatically generated on {today}.*"
 
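The rewritten TOC builds its link targets directly from the heading text. A quick sketch of the two anchor transformations above, on hypothetical headings:

```python
# Hypothetical headings, to show the anchors the new TOC code would emit.
section_name = "Research & Papers"
section_anchor = section_name.lower().replace(' ', '-')
print(section_anchor)  # research-&-papers

subsection_name = "OpenAI: What's Next?"
subsection_anchor = (subsection_name.lower()
                     .replace(' ', '-').replace(':', '')
                     .replace('?', '').replace('!', '').replace('.', ''))
print(subsection_anchor)  # openai-what's-next
```

Markdown viewers differ in how they slug headings (GitHub-style slugs also drop characters such as `&` and the apostrophe), so anchors built this way may not resolve in every renderer.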
@@ -559,6 +571,10 @@ def generate_ai_news_blog(groq_api_key=None, tavily_api_key=None, date=None):
                 "content": result["article_content"]
             })
 
+    # Check if we have any news items
+    if not news_contents:
+        return "No English language AI news items found for the specified date. Please try a different date."
+
     # Format news content for the blog generator
     formatted_content = "\n\n".join([
         f"TITLE: {item['title']}\nSOURCE: {item['source']}\nURL: {item['url']}\nDESCRIPTION: {item['description']}\nCONTENT: {item['content'][:2000]}..."
@@ -599,15 +615,35 @@ def create_gradio_interface():
                 tavily_key = gr.Textbox(label="Tavily API Key", placeholder="Enter your Tavily API key", type="password")
                 date_picker = gr.Textbox(label="Date (YYYY-MM-DD)", placeholder="Leave empty for today's date",
                                          value=datetime.now().strftime("%Y-%m-%d"))
-
+                with gr.Row():
+                    generate_button = gr.Button("Generate AI News Blog", variant="primary")
+                    clear_button = gr.Button("Clear Output")
 
             with gr.Column():
+                status_text = gr.Textbox(label="Status", placeholder="Ready to generate", interactive=False)
                 output_md = gr.Markdown("Your AI News Blog will appear here.")
 
+        # Add loading state and status updates
         generate_button.click(
+            fn=lambda: "Generating AI News Blog... This may take several minutes.",
+            inputs=None,
+            outputs=status_text,
+            queue=False
+        ).then(
             fn=run_generation,
             inputs=[groq_key, tavily_key, date_picker],
             outputs=output_md
+        ).then(
+            fn=lambda: "Blog generation complete!",
+            inputs=None,
+            outputs=status_text
+        )
+
+        # Clear output
+        clear_button.click(
+            fn=lambda: ("Ready to generate", ""),
+            inputs=None,
+            outputs=[status_text, output_md]
         )
 
     return demo
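The chained `.click(...).then(...)` calls above use Gradio's event chaining to show a status message before the long-running generation and update it again when the run finishes. A minimal self-contained sketch of the same pattern — the component names and the dummy task are placeholders, and it assumes a Gradio version that supports `.then()`, which the diff itself already relies on:

```python
import time
import gradio as gr

def slow_task(name):
    time.sleep(2)  # stand-in for the long-running blog generation
    return f"Hello, {name}!"

with gr.Blocks() as demo:
    name_box = gr.Textbox(label="Name")
    status = gr.Textbox(label="Status", interactive=False)
    output = gr.Markdown()
    run_btn = gr.Button("Run")

    # Update the status immediately, run the slow task, then update the status again.
    run_btn.click(
        fn=lambda: "Working...", inputs=None, outputs=status, queue=False
    ).then(
        fn=slow_task, inputs=name_box, outputs=output
    ).then(
        fn=lambda: "Done!", inputs=None, outputs=status
    )

if __name__ == "__main__":
    demo.launch()
```

Passing `queue=False` on the first step is what lets the status text update right away instead of waiting in the queue behind the slow job.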