# NOTE: the following lines are artifacts from the hosting page this file was
# copied from (site header, runtime banner, file size, commit hash, and a
# line-number ruler). They are not part of the program and are commented out
# so the file parses as Python.
# Spaces:
# Runtime error
# Runtime error
# File size: 5,556 Bytes
# 2519bba
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import requests
import os
import random
import hashlib
import json
# Pool of desktop browser user-agent strings; one is picked at random per run
# (see __main__) so the automated browser session looks like an ordinary client.
user_agents = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.1234.56 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/101.0.1234.56 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/101.0.1234.56",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/14.1.2",
]
def fetch_image_data(query: str, max_links_to_fetch: int, wd, sleep_between_interactions: int = 5):
    """Scrape Google Images for *query* and save up to *max_links_to_fetch* images.

    Images are written to ``images/<query>/<sha1-prefix>.jpg`` and a metadata
    dict is collected for each saved image.

    Parameters
    ----------
    query : str
        Search term inserted into the Google Images URL.
    max_links_to_fetch : int
        Stop after this many images have been saved.
    wd :
        An already-initialized Selenium WebDriver (e.g. ``webdriver.Edge``).
        The driver is navigated but NOT closed by this function.
    sleep_between_interactions : int, default 5
        Seconds to wait after each scroll/click so lazy-loaded results appear.

    Returns
    -------
    list[dict]
        One dict per saved image: keys ``"url"``, ``"title"``, ``"page_url"``,
        ``"Id"`` (10-char sha1 prefix of the image URL, also the file stem).
    """

    def scroll_to_end(wd):
        # Scroll to the bottom so Google lazy-loads the next batch of results.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"
    wd.get(search_url.format(q=query))

    image_data_list = []
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)
        # Get all image thumbnail results
        thumbnail_results = wd.find_elements(By.CLASS_NAME, "Q4LuWd")
        number_results = len(thumbnail_results)
        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        # FIX: the original looped forever when scrolling produced no new
        # thumbnails (fewer results exist than max_links_to_fetch) — bail out
        # instead of spinning.
        if number_results <= results_start:
            print("No new results after scrolling; stopping early.")
            break

        done = False
        for img in thumbnail_results[results_start:number_results]:
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                # Thumbnail may be stale or obscured by an overlay — skip it.
                continue

            # Extract image data: URL, title, and page URL of the preview pane.
            actual_images = wd.find_elements(By.CLASS_NAME, 'pT0Scc')
            for actual_image in actual_images:
                print("ACTUAL IMAGE: ", actual_image)
                src = actual_image.get_attribute('src')  # hoisted: was fetched twice
                if src and 'http' in src:
                    image_url = src
                    try:
                        # FIX: timeout so one unresponsive host cannot hang the
                        # whole scrape; network errors skip just this image.
                        response = requests.get(image_url, timeout=30)
                    except requests.RequestException:
                        continue
                    if response.status_code == 200:
                        image_title = actual_image.get_attribute('alt')
                        # Find the parent <a> tag of the image for the page URL
                        parent_a_tag = actual_image.find_element(By.XPATH, './ancestor::a')
                        image_page_url = parent_a_tag.get_attribute('href')
                        # Create a folder for the specific query if it doesn't exist.
                        # FIX: exist_ok avoids the check-then-create race.
                        query_folder = os.path.join('images', query)
                        os.makedirs(query_folder, exist_ok=True)
                        # Generate a unique file name using the URL hash
                        file_name = hashlib.sha1(image_url.encode()).hexdigest()[:10]
                        file_path = os.path.join(query_folder, f"{file_name}.jpg")
                        # Save the image
                        with open(file_path, 'wb') as f:
                            f.write(response.content)
                        print(f"SUCCESS - saved {image_url} - as {file_path}")
                        # Store the metadata in the list
                        image_data_list.append({
                            "url": image_url,
                            "title": image_title,
                            "page_url": image_page_url,
                            "Id": file_name
                        })
                        image_count += 1  # Increment the image count
                        if image_count >= max_links_to_fetch:
                            print(f"Found: {len(image_data_list)} images, done!")
                            done = True
                            break  # Exit the actual-images loop
            if done:
                break  # Exit the thumbnail loop
        if done:
            break  # Exit the scroll loop
        # Move the result start point further down
        results_start = number_results
    return image_data_list
if __name__ == '__main__':
    # Select a random user agent so the session resembles a normal browser.
    selected_user_agent = random.choice(user_agents)
    # Set the user agent for Edge driver
    options = webdriver.EdgeOptions()
    options.add_argument(f'user-agent={selected_user_agent}')
    # Initialize the Edge driver with the specified user agent
    wd = webdriver.Edge(options=options)
    queries = ["Elon Musk", "Barack Obama", "Taylor Swift", "Bill Gates", "Eminem"]  # change your set of queries here
    try:
        for query in queries:
            num_of_images = 20
            wd.get('https://google.com')
            search_box = wd.find_element(By.NAME, 'q')
            search_box.send_keys(query)
            image_data_list = fetch_image_data(query, num_of_images, wd)
            # Bundle the query with its collected image metadata.
            query_image_data = {
                "query": query,
                "images": image_data_list
            }
            # Save the metadata to a per-query JSON file.
            # FIX: json.dump writes straight to the handle (no intermediate
            # string); explicit UTF-8 + ensure_ascii=False keeps non-ASCII
            # titles readable in the output.
            json_filename = f"{query}.json"
            with open(json_filename, 'w', encoding='utf-8') as json_file:
                json.dump(query_image_data, json_file, indent=4, ensure_ascii=False)
    finally:
        # FIX: the original leaked the browser process if any query raised;
        # always shut the driver down.
        wd.quit()