jonathanmichael committed
Commit 7381abe · verified · 1 Parent(s): 5d579ec

Update app.py

BBC Scraper Tool

Files changed (1)
  1. app.py +69 -35
app.py CHANGED
@@ -7,63 +7,97 @@ from tools.final_answer import FinalAnswerTool
 
 from Gradio_UI import GradioUI
 
-# Below is an example of a tool that scrapes webpages for content!
 @tool
-def smart_webpage_scraper(url: str, selector: str = None) -> str:
-    """Intelligently scrapes webpage content with anti-detection measures.
+def get_bbc_headline(section: str = "news") -> str:
+    """Extracts the main headline from BBC website sections.
+
     Args:
-        url: The webpage URL to scrape
-        selector: Optional CSS selector for specific content
+        section: BBC section to check - 'news', 'sport', 'business', 'technology', 'world'
     """
     import requests
     from bs4 import BeautifulSoup
     import time
-    import random
+
+    # Map sections to URLs
+    bbc_urls = {
+        "news": "https://www.bbc.co.uk/news",
+        "sport": "https://www.bbc.co.uk/sport",
+        "business": "https://www.bbc.co.uk/news/business",
+        "technology": "https://www.bbc.co.uk/news/technology",
+        "world": "https://www.bbc.co.uk/news/world"
+    }
+
+    if section not in bbc_urls:
+        return f"Invalid section. Available: {', '.join(bbc_urls.keys())}"
+
+    url = bbc_urls[section]
 
     headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        'Accept-Encoding': 'gzip, deflate',
-        'Connection': 'keep-alive'
+        'Accept-Language': 'en-GB,en;q=0.5',
+        'Cache-Control': 'no-cache'
     }
 
     try:
-        # Add random delay to avoid detection
-        time.sleep(random.uniform(1, 3))
-
-        response = requests.get(url, headers=headers, timeout=15)
+        response = requests.get(url, headers=headers, timeout=10)
         response.raise_for_status()
 
         soup = BeautifulSoup(response.content, 'html.parser')
 
-        # Remove script and style elements
-        for script in soup(["script", "style"]):
-            script.decompose()
-
-        if selector:
-            elements = soup.select(selector)
-            if elements:
-                return '\n'.join([elem.get_text().strip() for elem in elements])
-            else:
-                return f"No elements found with selector: {selector}"
-
-        # Get main content, avoid headers/footers
-        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content') or soup.body
-
-        if main_content:
-            text = main_content.get_text(separator='\n', strip=True)
-            # Limit output and clean up
-            lines = [line.strip() for line in text.split('\n') if line.strip()]
-            return '\n'.join(lines[:100])  # First 100 meaningful lines
-
-        return "Could not extract meaningful content"
+        # BBC headline selectors (they change frequently, so multiple fallbacks)
+        headline_selectors = [
+            'h1[data-testid="headline"]',   # Current BBC format
+            'h1.gs-u-mt0',                  # Alternative format
+            '.media__title',                # Story format
+            'h1',                           # Generic fallback
+            '.gs-c-promo-heading__title'    # Promo heading
+        ]
+
+        main_headline = None
+
+        # Try each selector until we find a headline
+        for selector in headline_selectors:
+            headlines = soup.select(selector)
+            if headlines:
+                # Get the first meaningful headline
+                for headline in headlines:
+                    text = headline.get_text().strip()
+                    if len(text) > 10:  # Filter out short/empty headlines
+                        main_headline = text
+                        break
+                if main_headline:
+                    break
+
+        if main_headline:
+            # Get timestamp for context (use gmtime so the "UTC" label is accurate)
+            timestamp = time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime())
+            return f"BBC {section.title()} - Main Headline ({timestamp}):\n{main_headline}"
+        else:
+            return f"Could not extract headline from BBC {section} section. Site structure may have changed."
 
     except requests.exceptions.RequestException as e:
-        return f"Network error accessing {url}: {str(e)}"
+        return f"Error accessing BBC {section}: {str(e)}"
     except Exception as e:
-        return f"Parsing error for {url}: {str(e)}"
+        return f"Error parsing BBC {section}: {str(e)}"
+
+# Test function to validate the tool works
+def test_bbc_headline_tool():
+    """Test the BBC headline extraction on multiple sections"""
+    sections = ["news", "sport", "business"]
+
+    print("🔍 Testing BBC Headline Extraction")
+    print("=" * 40)
+
+    for section in sections:
+        print(f"\n📰 Testing {section.upper()} section...")
+        result = get_bbc_headline(section)
+        print(f"Result: {result[:100]}...")
+
+    print("\n✅ Test completed")
+
+if __name__ == "__main__":
+    test_bbc_headline_tool()
 
 @tool
 def get_current_time_in_timezone(timezone: str) -> str:
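
Note: the diff only touches the tool definitions; the agent wiring further down app.py is unchanged and not shown here. For context, below is a minimal sketch of how get_bbc_headline would typically be registered alongside FinalAnswerTool in this kind of smolagents + Gradio template. The model class (HfApiModel) and the exact constructor arguments are assumptions about the surrounding template, not part of this commit:

# Sketch only, not part of the commit: assumed tail of app.py.
from smolagents import CodeAgent, HfApiModel

from tools.final_answer import FinalAnswerTool   # imported at the top of app.py (see hunk header)
from Gradio_UI import GradioUI                   # imported at the top of app.py

final_answer = FinalAnswerTool()
model = HfApiModel()  # assumed default model; the real app.py may set model_id, max_tokens, etc.

# get_bbc_headline and get_current_time_in_timezone are the @tool functions defined above in app.py.
agent = CodeAgent(
    tools=[final_answer, get_bbc_headline, get_current_time_in_timezone],
    model=model,
    max_steps=6,  # assumed value
)

GradioUI(agent).launch()

With wiring like that, a prompt such as "What is the main BBC sport headline right now?" would let the agent call get_bbc_headline("sport") and hand the formatted result to final_answer.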