jonathanmichael committed
Commit 7381abe · verified · 1 Parent(s): 5d579ec

Update app.py

BBC Scraper Tool

Files changed (1)
  1. app.py +69 -35
app.py CHANGED
@@ -7,63 +7,97 @@ from tools.final_answer import FinalAnswerTool
 
 from Gradio_UI import GradioUI
 
-# Below is an example of a tool that scrapes webpages for content!
 @tool
-def smart_webpage_scraper(url: str, selector: str = None) -> str:
-    """Intelligently scrapes webpage content with anti-detection measures.
+def get_bbc_headline(section: str = "news") -> str:
+    """Extracts the main headline from BBC website sections.
+
     Args:
-        url: The webpage URL to scrape
-        selector: Optional CSS selector for specific content
+        section: BBC section to check - 'news', 'sport', 'business', 'technology', 'world'
     """
     import requests
     from bs4 import BeautifulSoup
     import time
-    import random
+
+    # Map sections to URLs
+    bbc_urls = {
+        "news": "https://www.bbc.co.uk/news",
+        "sport": "https://www.bbc.co.uk/sport",
+        "business": "https://www.bbc.co.uk/news/business",
+        "technology": "https://www.bbc.co.uk/news/technology",
+        "world": "https://www.bbc.co.uk/news/world"
+    }
+
+    if section not in bbc_urls:
+        return f"Invalid section. Available: {', '.join(bbc_urls.keys())}"
+
+    url = bbc_urls[section]
 
     headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        'Accept-Encoding': 'gzip, deflate',
-        'Connection': 'keep-alive'
+        'Accept-Language': 'en-GB,en;q=0.5',
+        'Cache-Control': 'no-cache'
     }
 
     try:
-        # Add random delay to avoid detection
-        time.sleep(random.uniform(1, 3))
-
-        response = requests.get(url, headers=headers, timeout=15)
+        response = requests.get(url, headers=headers, timeout=10)
         response.raise_for_status()
 
         soup = BeautifulSoup(response.content, 'html.parser')
 
-        # Remove script and style elements
-        for script in soup(["script", "style"]):
-            script.decompose()
-
-        if selector:
-            elements = soup.select(selector)
-            if elements:
-                return '\n'.join([elem.get_text().strip() for elem in elements])
-            else:
-                return f"No elements found with selector: {selector}"
-
-        # Get main content, avoid headers/footers
-        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content') or soup.body
-
-        if main_content:
-            text = main_content.get_text(separator='\n', strip=True)
-            # Limit output and clean up
-            lines = [line.strip() for line in text.split('\n') if line.strip()]
-            return '\n'.join(lines[:100])  # First 100 meaningful lines
-
-        return "Could not extract meaningful content"
+        # BBC headline selectors (they change frequently, so multiple fallbacks)
+        headline_selectors = [
+            'h1[data-testid="headline"]',   # Current BBC format
+            'h1.gs-u-mt0',                  # Alternative format
+            '.media__title',                # Story format
+            'h1',                           # Generic fallback
+            '.gs-c-promo-heading__title'    # Promo heading
+        ]
+
+        main_headline = None
+
+        # Try each selector until we find a headline
+        for selector in headline_selectors:
+            headlines = soup.select(selector)
+            if headlines:
+                # Get the first meaningful headline
+                for headline in headlines:
+                    text = headline.get_text().strip()
+                    if len(text) > 10:  # Filter out short/empty headlines
+                        main_headline = text
+                        break
+                if main_headline:
+                    break
+
+        if main_headline:
+            # Get timestamp for context (use gmtime so the "UTC" label is accurate)
+            timestamp = time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime())
+            return f"BBC {section.title()} - Main Headline ({timestamp}):\n{main_headline}"
+        else:
+            return f"Could not extract headline from BBC {section} section. Site structure may have changed."
 
     except requests.exceptions.RequestException as e:
-        return f"Network error accessing {url}: {str(e)}"
+        return f"Error accessing BBC {section}: {str(e)}"
     except Exception as e:
-        return f"Parsing error for {url}: {str(e)}"
+        return f"Error parsing BBC {section}: {str(e)}"
+
+# Test function to validate the tool works
+def test_bbc_headline_tool():
+    """Test the BBC headline extraction on multiple sections"""
+    sections = ["news", "sport", "business"]
+
+    print("🔍 Testing BBC Headline Extraction")
+    print("=" * 40)
+
+    for section in sections:
+        print(f"\n📰 Testing {section.upper()} section...")
+        result = get_bbc_headline(section)
+        print(f"Result: {result[:100]}...")
+
+    print("\n✅ Test completed")
+
+if __name__ == "__main__":
+    test_bbc_headline_tool()
 
 @tool
 def get_current_time_in_timezone(timezone: str) -> str:
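
Note: the diff only touches the tool definitions; the agent wiring further down app.py is unchanged and not shown here. For context, below is a minimal sketch of how get_bbc_headline would typically be registered alongside FinalAnswerTool in this kind of smolagents + Gradio template. The model class (HfApiModel) and the exact constructor arguments are assumptions about the surrounding template, not part of this commit:

# Sketch only, not part of the commit: assumed tail of app.py.
from smolagents import CodeAgent, HfApiModel

from tools.final_answer import FinalAnswerTool   # imported at the top of app.py (see hunk header)
from Gradio_UI import GradioUI                   # imported at the top of app.py

final_answer = FinalAnswerTool()
model = HfApiModel()  # assumed default model; the real app.py may set model_id, max_tokens, etc.

# get_bbc_headline and get_current_time_in_timezone are the @tool functions defined above in app.py.
agent = CodeAgent(
    tools=[final_answer, get_bbc_headline, get_current_time_in_timezone],
    model=model,
    max_steps=6,  # assumed value
)

GradioUI(agent).launch()

With wiring like that, a prompt such as "What is the main BBC sport headline right now?" would let the agent call get_bbc_headline("sport") and hand the formatted result to final_answer.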