Pamudu13 committed
Commit a2b8ed7 · verified · 1 Parent(s): 3edbebc

Update app.py

Files changed (1)
  1. app.py +260 -189
app.py CHANGED
@@ -1,193 +1,264 @@
-'''
-# Web Scrapping
-[@dwancin on HuggingFace](https://huggingface.co/spaces/dwancin/web-scraping)
-'''
-
-import os,re, requests, uuid, zipfile, hashlib, shutil
-import gradio as gr
 from bs4 import BeautifulSoup
-from urllib.parse import urljoin, urlparse
-
-# Function to validate URLs
-def validator(url):
-    parsed = urlparse(url)
-    return bool(parsed.netloc) and bool(parsed.scheme)
-
-
-# Function to find files on webpage
-def finder(url, soup, media_type):
-    files = []
-
-    # find image files
-    if media_type == "image":
-        tags = ['jpg', 'jpeg', 'png', 'svg', 'gif', 'webp', 'tiff', 'psd', 'eps', 'ai', 'indd', 'raw']
-        for tag in soup.find_all('img'):
-            file = tag.get('src')
-            if any(tag in file for tag in tags):
-                file_url = file
-                if not validator(file_url):
-                    file_url = urljoin(url, file_url)
-                files.append(file_url)
-
-    # find text
-    elif media_type == "text":
-        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong', 'pdf', 'txt', 'doc', 'rtf', 'docx']
-        for tag in text_tags:
-            for element in soup.find_all(tag):
-                files.append(element.get_text())
-
-    # find links
-    else:
-        for link in soup.find_all('a'):
-            file = link.get('href')
-            if media_type in file:
-                file_url = file
-                if not validator(file_url):
-                    file_url = urljoin(url, file_url)
-                files.append(file_url)
-
-    return files
-
-
-# Function to download the files
-def downloader(urls, folder_name):
-    os.makedirs(folder_name, exist_ok=True)
-    for i, url in enumerate(urls):
-        response = requests.get(url, stream=True)
-        file_extension = url.split(".")[-1].split("&")[0]
-        url_hash = hashlib.md5(url.encode()).hexdigest()
-        unique_id = str(uuid.uuid4())[:8]
-        file_name = f'{url_hash}-{unique_id}.{file_extension}'
-        file_name = file_name[:255]
-        file_name = re.sub(r'[\\/:"*?<>|]+', '_', file_name)
-        with open(f'{folder_name}/{file_name}', 'wb') as out_file:
-            out_file.write(response.content)
-            print(f"Downloaded file: {file_name}")
-
-
-# Function to create zip file
-def zipper(folder_name):
-    if os.listdir(folder_name):
-        with zipfile.ZipFile(f'{folder_name}.zip', 'w') as zipf:
-            for file in os.listdir(folder_name):
-                zipf.write(f'{folder_name}/{file}')
-        return f'{folder_name}.zip'
-    else:
-        return ""
-
-
-# Function to access website
-def scrapper(url, images=False, text=False):
     try:
-        response = requests.get(url, timeout=10)
         response.raise_for_status()
-    except (requests.exceptions.RequestException, ValueError):
-        raise gr.Error(f"Unable to access URL: {url}")
-        return None, None
-    soup = BeautifulSoup(response.content, 'html.parser')
-
-    # Clear all the previews folder data
-    if images:
-        shutil.rmtree('images', ignore_errors=True)
-    if text:
-        shutil.rmtree('text', ignore_errors=True)
-
-    # Add images to the image folder
-    if images:
-        image_urls = finder(url, soup, 'image')
-        os.makedirs('images', exist_ok=True)
-        if image_urls:
-            downloader(image_urls, 'images')
-        else:
-            raise gr.Error("Found no images.")
-
-    # Add text files to the text folder
-    if text:
-        text_content = finder(url, soup, 'text')
-        os.makedirs('text', exist_ok=True)
-        if text_content:
-            with open('text/content.txt', 'w') as text_file:
-                for line in text_content:
-                    text_file.write(line + '\n')
-
-    # Output folder(s) as zip files
-    images_zip_file, text_zip_file = None, None
-    if images and os.path.exists('images') and os.listdir('images'):
-        images_zip_file = zipper('images')
-    if text and os.path.exists('text') and os.listdir('text'):
-        text_zip_file = zipper('text')
-    return images_zip_file, text_zip_file
-
-
-# Function to find requests errors
-def checker(url, media_types):
-    if not url:
-        raise gr.Error("URL cannot be empty.")
-    if not url.startswith("https://"):
-        raise gr.Error("The URL must begin with https://")
-    if not media_types:
-        raise gr.Error("At least one media type must be selected.")
     try:
-        image_file, text_file = scrapper(url, "Images" in media_types, "Text" in media_types)
-    except requests.exceptions.HTTPError as e:
-        if e.response.status_code == 403:
-            raise gr.Error("HTTP Error: Forbidden. Access to the URL is forbidden.")
-        else:
-            raise gr.Error(f"HTTP Error: {e.response.status_code}")
-    except TypeError as e:
-        raise gr.Error(f"TypeError: {str(e)}")
-    except (requests.exceptions.RequestException, ValueError):
-        raise gr.Error(f"Unable to access URL: {url}")
-    files = []
-    if "Text" in media_types and not text_file:
-        raise gr.Error("Found no text.")
-    if "Images" in media_types and not image_file:
-        raise gr.Error("Found no images.")
-    if image_file:
-        files.append(image_file)
-    if text_file:
-        files.append(text_file)
-
-    print(f"Returning downloaded files from {url} in {files} ...")
-
-    return files
-
-# Gradio Interface
-with gr.Blocks(theme="Nymbo/Nymbo_Theme") as app:
-    title = gr.Markdown('''# Web Scraping 🕵️''')
-    description = gr.Markdown('''Get all media files from your desired webpages with just a few clicks.''')
-    with gr.Row():
-        with gr.Column(scale=0, min_width=480, variant="panel", elem_id="sd-panel"):
-            url_name = gr.Textbox(
-                placeholder="Enter URL here",
-                show_label=True,
-                label="Website",
-            )
-
-            media_types = gr.CheckboxGroup(
-                ["Images", "Text"],
-                value="Images",
-                label="Media types",
-            )
-
-            submit_button = gr.Button(
-                "Submit",
-                variant="primary",
-                interactive=True,
-            )
-
-        with gr.Column(scale=2):
-            output_files = gr.Files(
-                label="Output",
-                elem_id="file-list",
-                size="lg",
-                show_label=False,
-            )
-
-    submit_button.click(
-        checker,
-        inputs=[url_name, media_types],
-        outputs=[output_files],
-    )
-
-app.launch()
 
+from flask import Flask, jsonify, request
+import requests
 from bs4 import BeautifulSoup
+import os
+import re
+import urllib.parse
+import time
+import random
+import base64
+from io import BytesIO
+from urllib.parse import urlparse
+import html2text
+
+app = Flask(__name__)
+
+def search_images(query, num_images=5):
+    # Headers to mimic a browser request
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'Accept-Encoding': 'gzip, deflate',
+        'DNT': '1',
+        'Connection': 'keep-alive',
+    }
+
+    # Format the query for URL
+    formatted_query = urllib.parse.quote(query)
+
+    # Google Images URL
+    url = f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active"
+
+    try:
+        # Get the HTML content
+        response = requests.get(url, headers=headers, timeout=30)
+        response.raise_for_status()
+
+        # Find all image URLs using regex
+        image_urls = re.findall(r'https?://[^"\']*?(?:jpg|jpeg|png|gif)', response.text)
+
+        # Remove duplicates while preserving order
+        image_urls = list(dict.fromkeys(image_urls))
+
+        # Store results
+        results = []
+        downloaded = 0
+
+        for img_url in image_urls:
+            if downloaded >= num_images:
+                break
+
+            try:
+                # Skip small thumbnails and icons
+                if 'gstatic.com' in img_url or 'google.com' in img_url:
+                    continue
+
+                # Download image
+                img_response = requests.get(img_url, headers=headers, timeout=10)
+                img_response.raise_for_status()
+
+                # Check if the response is actually an image
+                content_type = img_response.headers.get('Content-Type', '')
+                if not content_type.startswith('image/'):
+                    continue
+
+                # Convert image to base64
+                image_base64 = base64.b64encode(img_response.content).decode('utf-8')
+
+                # Add to results
+                results.append({
+                    'image_url': img_url,
+                    'base64_data': f"data:{content_type};base64,{image_base64}"
+                })
+
+                downloaded += 1
+
+                # Add a random delay between downloads
+                time.sleep(random.uniform(0.5, 1))
+
+            except Exception as e:
+                print(f"Error downloading image: {str(e)}")
+                continue
+
+        return results
+
+    except Exception as e:
+        print(f"An error occurred: {str(e)}")
+        return []
+
+@app.route('/search_images', methods=['GET'])
+def api_search_images():
+    try:
+        # Get query parameters
+        query = request.args.get('query', '')
+        num_images = int(request.args.get('num_images', 5))
+
+        if not query:
+            return jsonify({'error': 'Query parameter is required'}), 400
+
+        if num_images < 1 or num_images > 20:
+            return jsonify({'error': 'Number of images must be between 1 and 20'}), 400
+
+        # Search for images
+        results = search_images(query, num_images)
+
+        return jsonify({
+            'success': True,
+            'query': query,
+            'results': results
+        })
+
+    except Exception as e:
+        return jsonify({
+            'success': False,
+            'error': str(e)
+        }), 500
+
+def get_domain(url):
+    """Extract domain from URL"""
+    parsed_uri = urlparse(url)
+    return parsed_uri.netloc
+
+def clean_text(text):
+    """Clean scraped text"""
+    # Remove extra whitespace
+    text = re.sub(r'\s+', ' ', text)
+    # Remove special characters
+    text = re.sub(r'[^\w\s.,!?-]', '', text)
+    return text.strip()
+
+def scrape_website(url, headers):
+    """Scrape content from a single website"""
+    try:
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Remove unwanted elements
+        for element in soup(['script', 'style', 'nav', 'footer', 'iframe']):
+            element.decompose()
+
+        # Convert HTML to text
+        h = html2text.HTML2Text()
+        h.ignore_links = True
+        h.ignore_images = True
+        text = h.handle(str(soup))
+
+        # Clean the text
+        text = clean_text(text)
+
+        # Get meta description
+        meta_desc = ''
+        meta_tag = soup.find('meta', attrs={'name': 'description'}) or soup.find('meta', attrs={'property': 'og:description'})
+        if meta_tag:
+            meta_desc = meta_tag.get('content', '')
+
+        # Get title
+        title = soup.title.string if soup.title else ''
+
+        return {
+            'title': clean_text(title),
+            'meta_description': clean_text(meta_desc),
+            'content': text[:1000],  # Limit content length
+            'url': url
+        }
+
+    except Exception as e:
+        print(f"Error scraping {url}: {str(e)}")
+        return None
+
+def search_and_scrape(query, num_results=5):
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'Accept-Encoding': 'gzip, deflate',
+        'DNT': '1',
+        'Connection': 'keep-alive',
+    }
+
+    # Format the query for URL
+    formatted_query = urllib.parse.quote(query)
+
+    # Google Search URL
+    url = f"https://www.google.com/search?q={formatted_query}&num={num_results}"
+
     try:
+        # Get Google search results
+        response = requests.get(url, headers=headers, timeout=30)
         response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Find all search result divs
+        search_results = []
+        result_divs = soup.find_all('div', class_='g')
+
+        for div in result_divs:
+            # Find the link
+            link = div.find('a')
+            if not link:
+                continue
+
+            href = link.get('href', '')
+
+            # Skip if not a valid URL or if it's a Google-related URL
+            if not href.startswith('http') or 'google.' in href:
+                continue
+
+            # Add random delay between requests
+            time.sleep(random.uniform(1, 2))
+
+            # Scrape the website
+            site_data = scrape_website(href, headers)
+            if site_data:
+                search_results.append(site_data)
+
+            if len(search_results) >= num_results:
+                break
+
+        return search_results
+
+    except Exception as e:
+        print(f"An error occurred: {str(e)}")
+        return []
+
+@app.route('/scrape_sites', methods=['GET'])
+def api_scrape_sites():
     try:
+        # Get query parameters
+        query = request.args.get('query', '')
+        num_results = int(request.args.get('num_results', 5))
+
+        if not query:
+            return jsonify({'error': 'Query parameter is required'}), 400
+
+        if num_results < 1 or num_results > 10:
+            return jsonify({'error': 'Number of results must be between 1 and 10'}), 400
+
+        # Search and scrape sites
+        results = search_and_scrape(query, num_results)
+
+        return jsonify({
+            'success': True,
+            'query': query,
+            'results': results
+        })
+
+    except Exception as e:
+        return jsonify({
+            'success': False,
+            'error': str(e)
+        }), 500
+
+if __name__ == "__main__":
+    app.run(debug=True, port=5000)
+
+
+
+
+
+
+
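
For reference, below is a minimal sketch of how the two endpoints added in this commit could be exercised once app.py is running locally. It assumes the default Flask host and the port 5000 configured above; the base_url name, query strings, parameter values, and timeouts are illustrative and not part of the commit.

import requests

# Hypothetical local base URL for the Flask app started with `python app.py`.
base_url = "http://127.0.0.1:5000"

# /search_images returns base64-encoded images for a query (1-20 images).
images = requests.get(
    f"{base_url}/search_images",
    params={"query": "sunset", "num_images": 3},
    timeout=120,
).json()
print(images.get("success"), len(images.get("results", [])))

# /scrape_sites returns title, meta description, and trimmed page content
# for up to 10 scraped search results.
sites = requests.get(
    f"{base_url}/scrape_sites",
    params={"query": "flask tutorial", "num_results": 2},
    timeout=120,
).json()
print(sites.get("success"), [r["url"] for r in sites.get("results", [])])

The generous client-side timeouts reflect that both endpoints fetch third-party pages and sleep between downloads before responding.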