jonathanmichael committed
Commit 5d579ec · verified · Parent(s): 415e279

Update app.py

Files changed (1): app.py +47 -11
app.py CHANGED
@@ -9,24 +9,60 @@ from Gradio_UI import GradioUI
 
 # Below is an example of a tool that scrapes webpages for content!
 @tool
-def scrape_webpage_content(url: str, css_selector: str = None) -> str:
-    """Scrapes and extracts content from a webpage with optional CSS selector filtering.
+def smart_webpage_scraper(url: str, selector: str = None) -> str:
+    """Intelligently scrapes webpage content with anti-detection measures.
     Args:
         url: The webpage URL to scrape
-        css_selector: Optional CSS selector to extract specific content
+        selector: Optional CSS selector for specific content
     """
+    import requests
+    from bs4 import BeautifulSoup
+    import time
+    import random
+
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'Accept-Encoding': 'gzip, deflate',
+        'Connection': 'keep-alive'
+    }
+
     try:
-        response = requests.get(url, timeout=10)
-        from bs4 import BeautifulSoup
+        # Add random delay to avoid detection
+        time.sleep(random.uniform(1, 3))
+
+        response = requests.get(url, headers=headers, timeout=15)
+        response.raise_for_status()
+
         soup = BeautifulSoup(response.content, 'html.parser')
 
-        if css_selector:
-            elements = soup.select(css_selector)
-            return '\n'.join([elem.get_text().strip() for elem in elements])
-        else:
-            return soup.get_text()[:2000]  # Limit output
+        # Remove script and style elements
+        for script in soup(["script", "style"]):
+            script.decompose()
+
+        if selector:
+            elements = soup.select(selector)
+            if elements:
+                return '\n'.join([elem.get_text().strip() for elem in elements])
+            else:
+                return f"No elements found with selector: {selector}"
+
+        # Get main content, avoid headers/footers
+        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content') or soup.body
+
+        if main_content:
+            text = main_content.get_text(separator='\n', strip=True)
+            # Limit output and clean up
+            lines = [line.strip() for line in text.split('\n') if line.strip()]
+            return '\n'.join(lines[:100])  # First 100 meaningful lines
+
+        return "Could not extract meaningful content"
+
+    except requests.exceptions.RequestException as e:
+        return f"Network error accessing {url}: {str(e)}"
     except Exception as e:
-        return f"Error scraping {url}: {str(e)}"
+        return f"Parsing error for {url}: {str(e)}"
 
 
 @tool
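
A quick way to exercise the new tool: the @tool-decorated function from smolagents stays directly callable, so it can be smoke-tested outside an agent run. A minimal sketch, assuming it executes in the same module that defines smart_webpage_scraper above (importing app.py from elsewhere would also start the Gradio UI); the URL and the "h1" selector are illustrative placeholders, not values from this commit:

    # Hypothetical smoke test; example.com and "h1" are placeholders.

    # Selector path: returns the text of matching elements, or the
    # "No elements found with selector: ..." message on a miss.
    print(smart_webpage_scraper(url="https://example.com", selector="h1"))

    # Fallback path: with no selector, the tool tries <main>, then
    # <article>, then <div class="content">, then <body>, returning at
    # most the first 100 non-empty lines of text.
    print(smart_webpage_scraper(url="https://example.com"))

Note that every call now sleeps a random 1-3 seconds before the request, so repeated calls are deliberately slow, and raise_for_status() means HTTP error pages surface as the "Network error" message instead of being scraped as content.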