Update app.py
app.py
CHANGED
@@ -113,6 +113,131 @@ def api_search_images():
             'error': str(e)
         }), 500
 
+def scrape_site_content(query, num_sites=5):
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'Accept-Encoding': 'gzip, deflate',
+        'DNT': '1',
+        'Connection': 'keep-alive',
+    }
+
+    results = []
+    scraped = 0
+
+    try:
+        # First, search Google for the query
+        search_url = f"https://www.google.com/search?q={urllib.parse.quote(query)}"
+        search_response = requests.get(search_url, headers=headers, timeout=30)
+        search_response.raise_for_status()
+
+        # Parse the search results
+        search_soup = BeautifulSoup(search_response.text, 'html.parser')
+        search_results = search_soup.find_all('div', class_='g')
+
+        # Extract URLs from search results
+        for result in search_results:
+            if scraped >= num_sites:
+                break
+
+            link = result.find('a')
+            if not link:
+                continue
+
+            url = link.get('href', '')
+            if not url.startswith(('http://', 'https://')):
+                continue
+
+            try:
+                # Get the HTML content
+                response = requests.get(url, headers=headers, timeout=30)
+                response.raise_for_status()
+
+                # Parse the HTML content
+                soup = BeautifulSoup(response.text, 'html.parser')
+
+                # Remove script and style elements
+                for script in soup(["script", "style"]):
+                    script.decompose()
+
+                # Extract text content
+                text_content = soup.get_text(separator='\n', strip=True)
+
+                # Extract all links
+                links = []
+                for link in soup.find_all('a', href=True):
+                    href = link['href']
+                    if href.startswith('http'):  # Only include absolute URLs
+                        links.append({
+                            'text': link.get_text(strip=True),
+                            'url': href
+                        })
+
+                # Extract meta information
+                title = soup.title.string if soup.title else ''
+                meta_description = ''
+                meta_keywords = ''
+
+                meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
+                if meta_desc_tag:
+                    meta_description = meta_desc_tag.get('content', '')
+
+                meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
+                if meta_keywords_tag:
+                    meta_keywords = meta_keywords_tag.get('content', '')
+
+                results.append({
+                    'url': url,
+                    'title': title,
+                    'meta_description': meta_description,
+                    'meta_keywords': meta_keywords,
+                    'text_content': text_content,
+                    'links': links
+                })
+
+                scraped += 1
+                # Add a random delay between scrapes
+                time.sleep(random.uniform(0.5, 1))
+
+            except Exception as e:
+                print(f"Error scraping {url}: {str(e)}")
+                continue
+
+    except Exception as e:
+        print(f"Error in search: {str(e)}")
+
+    return results
+
+@app.route('/scrape_sites', methods=['GET'])
+def api_scrape_sites():
+    try:
+        # Get query parameters
+        query = request.args.get('query', '')
+        num_sites = int(request.args.get('num_sites', 5))
+
+        if not query:
+            return jsonify({'error': 'Query parameter is required'}), 400
+
+        if num_sites < 1 or num_sites > 20:
+            return jsonify({'error': 'Number of sites must be between 1 and 20'}), 400
+
+        # Scrape the websites
+        results = scrape_site_content(query, num_sites)
+
+        return jsonify({
+            'success': True,
+            'query': query,
+            'results': results
+        })
+
+    except Exception as e:
+        return jsonify({
+            'success': False,
+            'error': str(e)
+        }), 500
+
+
 if __name__ == '__main__':
     app.run(host='0.0.0.0', port=5000)
 
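A minimal sketch of exercising the new /scrape_sites route from a separate client script. It assumes the app is reachable at localhost:5000 (matching the app.run call above); the query value and the timeout are purely illustrative, while the parameter names and response fields mirror what api_scrape_sites returns.

import requests

# Hypothetical client call against the /scrape_sites endpoint added in this commit.
resp = requests.get(
    "http://localhost:5000/scrape_sites",
    params={"query": "python web scraping", "num_sites": 3},  # num_sites must be between 1 and 20
    timeout=120,  # generous timeout: the server fetches several pages sequentially
)
data = resp.json()

if data.get("success"):
    for site in data["results"]:
        # Each result carries: url, title, meta_description, meta_keywords, text_content, links
        print(site["url"], "-", site["title"])
else:
    # Error responses (400/500) also come back as JSON with an 'error' field
    print("Error:", data.get("error"))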