Pamudu13 committed on
Commit
bc96608
·
verified ·
1 Parent(s): 3f5c705

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -27
app.py CHANGED
@@ -8,6 +8,7 @@ import time
8
  import random
9
  import base64
10
  from io import BytesIO
 
11
 
12
  app = Flask(__name__)
13
 
@@ -127,33 +128,8 @@ def scrape_site_content(query, num_sites=5):
127
  scraped = 0
128
 
129
  try:
130
- # Use a more direct search URL format
131
- search_url = f"https://www.bing.com/search?q={urllib.parse.quote(query)}&num={num_sites}"
132
- search_response = requests.get(search_url, headers=headers, timeout=30)
133
- search_response.raise_for_status()
134
-
135
- # Parse the search results
136
- search_soup = BeautifulSoup(search_response.text, 'html.parser')
137
-
138
- # Look for URLs in multiple possible locations
139
- search_results = []
140
-
141
- # Method 1: Look for cite elements
142
- for cite in search_soup.find_all('cite'):
143
- url = cite.text.strip()
144
- if url.startswith(('http://', 'https://')):
145
- search_results.append(url)
146
-
147
- # Method 2: Look for links with specific attributes
148
- for a in search_soup.find_all('a'):
149
- href = a.get('href', '')
150
- if 'url?q=' in href:
151
- url = href.split('url?q=')[1].split('&')[0]
152
- if url.startswith(('http://', 'https://')):
153
- search_results.append(urllib.parse.unquote(url))
154
-
155
- # Remove duplicates while preserving order
156
- search_results = list(dict.fromkeys(search_results))
157
 
158
  # Process each found URL
159
  for url in search_results:
 
8
  import random
9
  import base64
10
  from io import BytesIO
11
+ from googlesearch import search
12
 
13
  app = Flask(__name__)
14
 
 
128
  scraped = 0
129
 
130
  try:
131
+ # Use googlesearch-python to get URLs
132
+ search_results = search(query, num_results=num_sites)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
  # Process each found URL
135
  for url in search_results: