Pamudu13 committed on
Commit befa8d6 · verified · 1 Parent(s): b868160

Update app.py

Files changed (1)
  1. app.py +125 -0
app.py CHANGED
@@ -113,6 +113,131 @@ def api_search_images():
            'error': str(e)
        }), 500

+def scrape_site_content(query, num_sites=5):
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'Accept-Encoding': 'gzip, deflate',
+        'DNT': '1',
+        'Connection': 'keep-alive',
+    }
+
+    results = []
+    scraped = 0
+
+    try:
+        # First, search Google for the query
+        search_url = f"https://www.google.com/search?q={urllib.parse.quote(query)}"
+        search_response = requests.get(search_url, headers=headers, timeout=30)
+        search_response.raise_for_status()
+
+        # Parse the search results
+        search_soup = BeautifulSoup(search_response.text, 'html.parser')
+        search_results = search_soup.find_all('div', class_='g')
+
+        # Extract URLs from search results
+        for result in search_results:
+            if scraped >= num_sites:
+                break
+
+            link = result.find('a')
+            if not link:
+                continue
+
+            url = link.get('href', '')
+            if not url.startswith(('http://', 'https://')):
+                continue
+
+            try:
+                # Get the HTML content
+                response = requests.get(url, headers=headers, timeout=30)
+                response.raise_for_status()
+
+                # Parse the HTML content
+                soup = BeautifulSoup(response.text, 'html.parser')
+
+                # Remove script and style elements
+                for script in soup(["script", "style"]):
+                    script.decompose()
+
+                # Extract text content
+                text_content = soup.get_text(separator='\n', strip=True)
+
+                # Extract all links
+                links = []
+                for link in soup.find_all('a', href=True):
+                    href = link['href']
+                    if href.startswith('http'):  # Only include absolute URLs
+                        links.append({
+                            'text': link.get_text(strip=True),
+                            'url': href
+                        })
+
+                # Extract meta information
+                title = soup.title.string if soup.title else ''
+                meta_description = ''
+                meta_keywords = ''
+
+                meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
+                if meta_desc_tag:
+                    meta_description = meta_desc_tag.get('content', '')
+
+                meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
+                if meta_keywords_tag:
+                    meta_keywords = meta_keywords_tag.get('content', '')
+
+                results.append({
+                    'url': url,
+                    'title': title,
+                    'meta_description': meta_description,
+                    'meta_keywords': meta_keywords,
+                    'text_content': text_content,
+                    'links': links
+                })
+
+                scraped += 1
+                # Add a random delay between scrapes
+                time.sleep(random.uniform(0.5, 1))
+
+            except Exception as e:
+                print(f"Error scraping {url}: {str(e)}")
+                continue
+
+    except Exception as e:
+        print(f"Error in search: {str(e)}")
+
+    return results
+
+@app.route('/scrape_sites', methods=['GET'])
+def api_scrape_sites():
+    try:
+        # Get query parameters
+        query = request.args.get('query', '')
+        num_sites = int(request.args.get('num_sites', 5))
+
+        if not query:
+            return jsonify({'error': 'Query parameter is required'}), 400
+
+        if num_sites < 1 or num_sites > 20:
+            return jsonify({'error': 'Number of sites must be between 1 and 20'}), 400
+
+        # Scrape the websites
+        results = scrape_site_content(query, num_sites)
+
+        return jsonify({
+            'success': True,
+            'query': query,
+            'results': results
+        })
+
+    except Exception as e:
+        return jsonify({
+            'success': False,
+            'error': str(e)
+        }), 500
+
+
 if __name__ == '__main__':
     app.run(host='0.0.0.0', port=5000)
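
For reference, a minimal sketch of how the new /scrape_sites route could be exercised once this version of app.py is running; the base URL, example query values, and timeout are assumptions based on the app.run() call above, not part of the commit:

    import requests

    # Hypothetical client call; assumes the Flask app is reachable on
    # the host/port configured in app.run() above.
    resp = requests.get(
        "http://localhost:5000/scrape_sites",
        params={"query": "flask web scraping", "num_sites": 3},  # example values
        timeout=120,  # scraping several sites can take a while
    )
    data = resp.json()
    if data.get("success"):
        for site in data["results"]:
            # Each entry carries url, title, meta_description, meta_keywords,
            # text_content, and links, per scrape_site_content() above.
            print(site["url"], "-", site["title"])
    else:
        print("Request failed:", data.get("error"))

On success the endpoint returns {'success': True, 'query': ..., 'results': [...]}; missing or out-of-range parameters produce a 400 with an 'error' field, and unexpected failures a 500.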