from flask import Flask, jsonify, request, Response, stream_with_context
from flask_cors import CORS
import requests
from bs4 import BeautifulSoup
import os
import re
import urllib.parse
import time
import random
import base64
from io import BytesIO
from googlesearch import search
import logging
import queue
from huggingface_hub import HfApi

# Create a logging filter to suppress socket warnings
class SocketWarningFilter(logging.Filter):
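    """Logging filter that drops WARNING records mentioning socket.send()."""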
    def filter(self, record):
        return not (record.levelname == 'WARNING' and 'socket.send()' in record.getMessage())

# Create a queue for log messages
log_queue = queue.Queue()

# Custom log handler that puts messages in the queue
class QueueHandler(logging.Handler):
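    """Logging handler that pushes formatted records onto log_queue for streaming."""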
    def emit(self, record):
        log_entry = self.format(record)
        log_queue.put(log_entry)

# Set up logging with the custom handler
logger = logging.getLogger()
queue_handler = QueueHandler()
queue_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
queue_handler.addFilter(SocketWarningFilter())  # Add the filter to the handler
logger.addHandler(queue_handler)
logger.setLevel(logging.INFO)

# Also add the filter to the root logger to catch all socket warnings
logging.getLogger().addFilter(SocketWarningFilter())

app = Flask(__name__)
# Enable CORS with specific settings
CORS(app, resources={
    r"/*": {
        "origins": "*",
        "methods": ["GET", "POST", "OPTIONS"],
        "allow_headers": ["Content-Type", "Authorization"]
    }
})

def search_images(query, num_images=5):
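    """Search Google Images for `query` and return up to `num_images` results.

    Each result is a dict containing the original image URL and the image
    encoded as a base64 data URI.
    """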
    # Headers to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
    }

    # Format the query for URL
    formatted_query = urllib.parse.quote(query)

    # Google Images URL
    url = f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active"

    try:
        # Get the HTML content
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # Find all image URLs using regex
        image_urls = re.findall(r'https?://[^"\']+?\.(?:jpg|jpeg|png|gif)', response.text)

        # Remove duplicates while preserving order
        image_urls = list(dict.fromkeys(image_urls))

        # Store results
        results = []
        downloaded = 0

        for img_url in image_urls:
            if downloaded >= num_images:
                break

            try:
                # Skip small thumbnails and icons
                if 'gstatic.com' in img_url or 'google.com' in img_url:
                    continue

                # Download image
                img_response = requests.get(img_url, headers=headers, timeout=10)
                img_response.raise_for_status()

                # Check if the response is actually an image
                content_type = img_response.headers.get('Content-Type', '')
                if not content_type.startswith('image/'):
                    continue

                # Convert image to base64
                image_base64 = base64.b64encode(img_response.content).decode('utf-8')

                # Add to results
                results.append({
                    'image_url': img_url,
                    'base64_data': f"data:{content_type};base64,{image_base64}"
                })

                downloaded += 1

                # Add a random delay between downloads
                time.sleep(random.uniform(0.5, 1))

            except Exception as e:
                logger.error(f"Error downloading image: {str(e)}")
                continue

        return results

    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
        return []



HF_TOKEN = os.getenv("HF_TOKEN")  # Make sure you set the HF_TOKEN in your environment

@app.route('/restart_space', methods=['POST'])
def api_restart_space():
    """API route to restart a Hugging Face Space."""
    space_id = 'Pamudu13/web-scraper'
    payload = request.get_json(silent=True) or {}  # Tolerate requests without a JSON body
    factory_reboot = payload.get('factory_reboot', False)  # Optional: set to True for a factory reboot

    if not space_id:
        return jsonify({'error': 'space_id parameter is required'}), 400

    try:
        hfapi = HfApi()

        # Call the restart_space method
        res = hfapi.restart_space(
            space_id,
            token=HF_TOKEN,
            factory_reboot=factory_reboot
        )

        return jsonify({
            'success': True,
            'message': f"Successfully restarted Space: {space_id}",
            'response': res
        }), 200

    except Exception as e:
        return jsonify({
            'success': False,
            'message': f"Error: {str(e)}"
        }), 500    

@app.route('/get_live_space_status', methods=['GET'])
def get_live_space_status():
    """API route to stream live status of a Hugging Face Space."""
    space_id = request.args.get('space_id', 'Pamudu13/web-scraper')  # Default to 'Pamudu13/web-scraper' if not provided

    def generate():
        while True:
            try:
                # Fetch the current runtime status of the Space
                hfapi = HfApi(token=HF_TOKEN)
                space_runtime = hfapi.get_space_runtime(repo_id=space_id)
                
                # Extract relevant details
                status = space_runtime.stage  # e.g., 'BUILDING', 'RUNNING', etc.
                hardware = space_runtime.hardware  # e.g., 'cpu-basic', 't4-medium', etc.
                
                # Send the status as a Server-Sent Event
                yield f"data: {status}\n\n"
                yield f"data: {hardware}\n\n"

                # Delay before checking the status again
                time.sleep(5)  # Adjust polling interval as needed

            except Exception as e:
                # Handle errors and send an error message
                yield f"data: Error: {str(e)}\n\n"
                break  # Stop the stream in case of an error

    return Response(stream_with_context(generate()), mimetype='text/event-stream')




@app.route('/search_images', methods=['GET'])
def api_search_images():
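    """Handle GET /search_images: validate parameters and return search_images() results as JSON."""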
    try:
        # Get query parameters
        query = request.args.get('query', '')
        num_images = int(request.args.get('num_images', 5))

        if not query:
            return jsonify({'error': 'Query parameter is required'}), 400

        if num_images < 1 or num_images > 20:
            return jsonify({'error': 'Number of images must be between 1 and 20'}), 400

        # Search for images
        results = search_images(query, num_images)

        response = jsonify({
            'success': True,
            'query': query,
            'results': results
        })

        # Add CORS headers
        response.headers['Access-Control-Allow-Origin'] = '*'
        return response

    except Exception as e:
        logger.error(f"Error in search_images: {str(e)}")
        response = jsonify({
            'success': False,
            'error': str(e)
        })

        # Add CORS headers and return a 500 status
        response.headers['Access-Control-Allow-Origin'] = '*'
        return response, 500

def scrape_site_content(query, num_sites=5):
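    """Google-search `query` and scrape up to `num_sites` of the result pages.

    For each page this returns its URL, title, meta description/keywords, the
    visible text (truncated to 10,000 characters), and up to ten outbound links.
    """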
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
    }

    results = []
    scraped = 0
    retries = 2  # Number of retries per URL
    timeout = 5  # Reduced timeout to 5 seconds

    try:
        # Get more URLs than needed to account for failures
        search_results = list(search(query, num_results=num_sites * 2))

        # Process each found URL
        for url in search_results:
            if scraped >= num_sites:
                break

            success = False
            for attempt in range(retries):
                try:
                    # Get the HTML content
                    logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
                    logger.info(f"Scraping URL: {url}")
                    response = requests.get(
                        url,
                        headers=headers,
                        timeout=timeout,
                        verify=False  # Skip SSL verification
                    )
                    response.raise_for_status()

                    # Verify it's HTML content
                    content_type = response.headers.get('Content-Type', '').lower()
                    if 'text/html' not in content_type:
                        logger.info(f"Skipping {url} - not HTML content")
                        break

                    # Parse the HTML content
                    soup = BeautifulSoup(response.text, 'html.parser')

                    # Remove script and style elements
                    for script in soup(["script", "style"]):
                        script.decompose()

                    # Extract text content (limit to first 10000 characters)
                    text_content = soup.get_text(separator='\n', strip=True)[:10000]

                    # Skip if not enough content
                    if len(text_content.split()) < 100:  # Skip if less than 100 words
                        logger.info(f"Skipping {url} - not enough content")
                        break

                    # Extract all links (limit to first 10)
                    links = []
                    for link in soup.find_all('a', href=True)[:10]:
                        href = link['href']
                        if href.startswith('http'):
                            links.append({
                                'text': link.get_text(strip=True),
                                'url': href
                            })

                    # Extract meta information
                    title = soup.title.string if soup.title else ''
                    meta_description = ''
                    meta_keywords = ''

                    meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
                    if meta_desc_tag:
                        meta_description = meta_desc_tag.get('content', '')

                    meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
                    if meta_keywords_tag:
                        meta_keywords = meta_keywords_tag.get('content', '')

                    results.append({
                        'url': url,
                        'title': title,
                        'meta_description': meta_description,
                        'meta_keywords': meta_keywords,
                        'text_content': text_content,
                        'links': links
                    })

                    scraped += 1
                    success = True
                    # Add a random delay between scrapes
                    time.sleep(random.uniform(0.5, 1))
                    break  # Break retry loop on success

                except requests.Timeout:
                    logger.warning(f"Timeout on {url} (attempt {attempt + 1}/{retries})")
                    if attempt == retries - 1:  # Last attempt
                        logger.warning(f"Skipping {url} after {retries} timeout attempts")
                except requests.RequestException as e:
                    logger.warning(f"Error scraping {url} (attempt {attempt + 1}/{retries}): {str(e)}")
                    if attempt == retries - 1:  # Last attempt
                        logger.warning(f"Skipping {url} after {retries} failed attempts")

                # Add a longer delay between retries
                if not success and attempt < retries - 1:
                    time.sleep(random.uniform(1, 2))

            # The loop moves on to the next URL until enough sites have been scraped

        return results

    except Exception as e:
        print(f"Error in search/scraping process: {str(e)}")
        # Return whatever results we've managed to gather
        return results


@app.route('/scrape_sites', methods=['GET'])
def api_scrape_sites():
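    """Handle GET /scrape_sites: validate parameters and return scrape_site_content() results as JSON."""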
    try:
        # Get query parameters
        query = request.args.get('query', '')
        num_sites = int(request.args.get('num_sites', 10))

        if not query:
            return jsonify({'error': 'Query parameter is required'}), 400

        if num_sites < 1 or num_sites > 20:
            return jsonify({'error': 'Number of sites must be between 1 and 20'}), 400

        # Scrape the websites
        results = scrape_site_content(query, num_sites)

        response = jsonify({
            'success': True,
            'query': query,
            'results': results
        })

        # Add CORS headers
        response.headers['Access-Control-Allow-Origin'] = '*'
        return response

    except Exception as e:
        logger.error(f"Error in api_scrape_sites: {str(e)}")
        response = jsonify({
            'success': False,
            'error': str(e)
        })

        # Add CORS headers and return a 500 status
        response.headers['Access-Control-Allow-Origin'] = '*'
        return response, 500

@app.route('/logs/stream')
def stream_logs():
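    """Stream log messages from log_queue to the client as Server-Sent Events."""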
    def generate():
        while True:
            try:
                # Get log message from queue, timeout after 1 second
                log_message = log_queue.get(timeout=1)
                yield f"data: {log_message}\n\n"
            except queue.Empty:
                # Send a heartbeat to keep the connection alive
                yield "data: heartbeat\n\n"
            except GeneratorExit:
                break

    response = Response(stream_with_context(generate()), mimetype='text/event-stream')
    response.headers['Cache-Control'] = 'no-cache'
    response.headers['Connection'] = 'keep-alive'
    return response

if __name__ == '__main__':
    logger.info("Starting Flask API server...")
    app.run(host='0.0.0.0', port=5001, debug=True)
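
# Example requests once the server is running (illustrative values; host and
# port assume the default app.run() settings above):
#
#   curl "http://localhost:5001/search_images?query=sunsets&num_images=3"
#   curl "http://localhost:5001/scrape_sites?query=flask+tutorial&num_sites=5"
#   curl -X POST -H "Content-Type: application/json" \
#        -d '{"factory_reboot": false}' http://localhost:5001/restart_space
#   curl -N "http://localhost:5001/logs/stream"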