Update app.py

app.py CHANGED
@@ -127,33 +127,49 @@ def scrape_site_content(query, num_sites=5):
     scraped = 0
 
     try:
-        # ...
-        search_url = f"https://www.google.com/search?q={urllib.parse.quote(query)}"
+        # Use a more direct search URL format
+        search_url = f"https://www.google.com/search?q={urllib.parse.quote(query)}&num={num_sites}"
         search_response = requests.get(search_url, headers=headers, timeout=30)
         search_response.raise_for_status()
 
         # Parse the search results
         search_soup = BeautifulSoup(search_response.text, 'html.parser')
-        search_results = search_soup.find_all('div', class_='g')
 
-        # ...
-        ...
-            if scraped >= num_sites:
-                break
-            ...
+        # Look for URLs in multiple possible locations
+        search_results = []
+
+        # Method 1: Look for cite elements
+        for cite in search_soup.find_all('cite'):
+            url = cite.text.strip()
+            if url.startswith(('http://', 'https://')):
+                search_results.append(url)
+
+        # Method 2: Look for links with specific attributes
+        for a in search_soup.find_all('a'):
+            href = a.get('href', '')
+            if 'url?q=' in href:
+                url = href.split('url?q=')[1].split('&')[0]
+                if url.startswith(('http://', 'https://')):
+                    search_results.append(urllib.parse.unquote(url))
+
+        # Remove duplicates while preserving order
+        search_results = list(dict.fromkeys(search_results))
+
+        # Process each found URL
+        for url in search_results:
+            if scraped >= num_sites:
+                break
 
             try:
                 # Get the HTML content
-                response = requests.get(url, headers=headers, timeout=
+                response = requests.get(url, headers=headers, timeout=10)
                 response.raise_for_status()
 
+                # Verify it's HTML content
+                content_type = response.headers.get('Content-Type', '').lower()
+                if 'text/html' not in content_type:
+                    continue
+
                 # Parse the HTML content
                 soup = BeautifulSoup(response.text, 'html.parser')
 
@@ -161,14 +177,14 @@ def scrape_site_content(query, num_sites=5):
                 for script in soup(["script", "style"]):
                     script.decompose()
 
-                # Extract text content
-                text_content = soup.get_text(separator='\n', strip=True)
+                # Extract text content (limit to first 1000 characters)
+                text_content = soup.get_text(separator='\n', strip=True)[:1000]
 
-                # Extract all links
+                # Extract all links (limit to first 10)
                 links = []
-                for link in soup.find_all('a', href=True):
+                for link in soup.find_all('a', href=True)[:10]:
                     href = link['href']
-                    if href.startswith('http'):
+                    if href.startswith('http'):
                         links.append({
                             'text': link.get_text(strip=True),
                             'url': href