jonathanmichael committed
Commit 5d579ec · verified · Parent(s): 415e279

Update app.py

Files changed (1): app.py +47 -11
app.py CHANGED
@@ -9,24 +9,60 @@ from Gradio_UI import GradioUI
 
 # Below is an example of a tool that scrapes webpages for content!
 @tool
-def scrape_webpage_content(url: str, css_selector: str = None) -> str:
-    """Scrapes and extracts content from a webpage with optional CSS selector filtering.
+def smart_webpage_scraper(url: str, selector: str = None) -> str:
+    """Intelligently scrapes webpage content with anti-detection measures.
     Args:
         url: The webpage URL to scrape
-        css_selector: Optional CSS selector to extract specific content
+        selector: Optional CSS selector for specific content
     """
+    import requests
+    from bs4 import BeautifulSoup
+    import time
+    import random
+
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'Accept-Encoding': 'gzip, deflate',
+        'Connection': 'keep-alive'
+    }
+
     try:
-        response = requests.get(url, timeout=10)
-        from bs4 import BeautifulSoup
+        # Add random delay to avoid detection
+        time.sleep(random.uniform(1, 3))
+
+        response = requests.get(url, headers=headers, timeout=15)
+        response.raise_for_status()
+
         soup = BeautifulSoup(response.content, 'html.parser')
 
-        if css_selector:
-            elements = soup.select(css_selector)
-            return '\n'.join([elem.get_text().strip() for elem in elements])
-        else:
-            return soup.get_text()[:2000]  # Limit output
+        # Remove script and style elements
+        for script in soup(["script", "style"]):
+            script.decompose()
+
+        if selector:
+            elements = soup.select(selector)
+            if elements:
+                return '\n'.join([elem.get_text().strip() for elem in elements])
+            else:
+                return f"No elements found with selector: {selector}"
+
+        # Get main content, avoid headers/footers
+        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content') or soup.body
+
+        if main_content:
+            text = main_content.get_text(separator='\n', strip=True)
+            # Limit output and clean up
+            lines = [line.strip() for line in text.split('\n') if line.strip()]
+            return '\n'.join(lines[:100])  # First 100 meaningful lines
+
+        return "Could not extract meaningful content"
+
+    except requests.exceptions.RequestException as e:
+        return f"Network error accessing {url}: {str(e)}"
     except Exception as e:
-        return f"Error scraping {url}: {str(e)}"
+        return f"Parsing error for {url}: {str(e)}"
 
 
 @tool
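
A quick way to exercise the new tool: the @tool-decorated function from smolagents stays directly callable, so it can be smoke-tested outside an agent run. A minimal sketch, assuming it executes in the same module that defines smart_webpage_scraper above (importing app.py from elsewhere would also start the Gradio UI); the URL and the "h1" selector are illustrative placeholders, not values from this commit:

    # Hypothetical smoke test; example.com and "h1" are placeholders.

    # Selector path: returns the text of matching elements, or the
    # "No elements found with selector: ..." message on a miss.
    print(smart_webpage_scraper(url="https://example.com", selector="h1"))

    # Fallback path: with no selector, the tool tries <main>, then
    # <article>, then <div class="content">, then <body>, returning at
    # most the first 100 non-empty lines of text.
    print(smart_webpage_scraper(url="https://example.com"))

Note that every call now sleeps a random 1-3 seconds before the request, so repeated calls are deliberately slow, and raise_for_status() means HTTP error pages surface as the "Network error" message instead of being scraped as content.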