File size: 5,556 Bytes
2519bba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import requests
import os
import random
import hashlib
import json

# Pool of desktop browser user-agent strings; one is picked at random in
# __main__ and passed to the Edge driver — presumably to make repeated runs
# look less uniform to the server (NOTE(review): confirm this is still needed).
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.1234.56 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/101.0.1234.56 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/101.0.1234.56",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/14.1.2",
]

def fetch_image_data(query: str, max_links_to_fetch: int, wd: webdriver.Remote, sleep_between_interactions: int = 5):
    """Scrape up to ``max_links_to_fetch`` full-size images for ``query`` from Google Images.

    Navigates the given driver to the image-search results, scrolls to load
    thumbnails, clicks each one, downloads the full-size image into
    ``images/<query>/<sha1-prefix>.jpg`` and collects metadata.

    Args:
        query: Search term; also used as the output sub-folder name.
        max_links_to_fetch: Stop after this many images have been saved.
        wd: An already-initialized Selenium webdriver.
        sleep_between_interactions: Seconds to wait after each scroll/click
            so the page can load.

    Returns:
        A list of dicts with keys ``url``, ``title``, ``page_url`` and ``Id``
        (the 10-char SHA-1 prefix used as the file name).
    """
    def scroll_to_end(wd):
        # Scroll to the bottom so Google lazy-loads another batch of thumbnails.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"
    wd.get(search_url.format(q=query))

    image_data_list = []
    image_count = 0
    results_start = 0

    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # Get all image thumbnail results currently in the DOM.
        thumbnail_results = wd.find_elements(By.CLASS_NAME, "Q4LuWd")
        number_results = len(thumbnail_results)

        # BUGFIX: the original looped forever when scrolling produced no new
        # thumbnails (results exhausted). Bail out instead.
        if number_results <= results_start:
            print("No new results after scrolling; stopping early.")
            break

        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")
        done = False
        for img in thumbnail_results[results_start:number_results]:
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                # Thumbnail may be stale or obscured; skip it.
                continue

            # Candidates for the opened full-size image.
            actual_images = wd.find_elements(By.CLASS_NAME, 'pT0Scc')
            for actual_image in actual_images:
                print("ACTUAL IMAGE: ", actual_image)
                # Hoist the attribute lookup (the original fetched 'src' twice).
                image_url = actual_image.get_attribute('src')
                if not image_url or 'http' not in image_url:
                    continue

                # BUGFIX: no timeout meant a hung server blocked forever, and
                # any network error crashed the whole scrape. Skip instead.
                try:
                    response = requests.get(image_url, timeout=30)
                except requests.RequestException as exc:
                    print(f"ERROR - could not download {image_url} - {exc}")
                    continue
                if response.status_code != 200:
                    continue

                image_title = actual_image.get_attribute('alt')

                # The parent <a> tag carries the URL of the hosting page.
                parent_a_tag = actual_image.find_element(By.XPATH, './ancestor::a')
                image_page_url = parent_a_tag.get_attribute('href')

                # One folder per query; exist_ok avoids the check-then-create race.
                query_folder = os.path.join('images', query)
                os.makedirs(query_folder, exist_ok=True)

                # Unique, stable file name derived from the URL hash.
                file_name = hashlib.sha1(image_url.encode()).hexdigest()[:10]
                file_path = os.path.join(query_folder, f"{file_name}.jpg")

                with open(file_path, 'wb') as f:
                    f.write(response.content)

                print(f"SUCCESS - saved {image_url} - as {file_path}")

                image_data_list.append({
                    "url": image_url,
                    "title": image_title,
                    "page_url": image_page_url,
                    "Id": file_name
                })

                image_count += 1
                if image_count >= max_links_to_fetch:
                    print(f"Found: {len(image_data_list)} images, done!")
                    done = True
                    break
            if done:
                break

        # Next pass only inspects thumbnails we have not visited yet.
        results_start = number_results

    return image_data_list

if __name__ == '__main__':
    # Select a random user agent for this run.
    selected_user_agent = random.choice(user_agents)

    # Configure the Edge driver with the chosen user agent.
    options = webdriver.EdgeOptions()
    options.add_argument(f'user-agent={selected_user_agent}')
    wd = webdriver.Edge(options=options)

    queries = ["Elon Musk", "Barack Obama", "Taylor Swift", "Bill Gates", "Eminem"]  # change your set of queries here

    # BUGFIX: wrap the whole scrape in try/finally so the browser process is
    # always closed, even when a query raises mid-run (the original leaked it).
    try:
        for query in queries:
            num_of_images = 20

            # Warm up on the homepage and type the query into the search box.
            # NOTE(review): fetch_image_data navigates straight to the image
            # search URL, so this typing appears redundant — kept for parity.
            wd.get('https://google.com')
            search_box = wd.find_element(By.NAME, 'q')
            search_box.send_keys(query)

            image_data_list = fetch_image_data(query, num_of_images, wd)

            # Persist the scraped metadata as "<query>.json".
            query_image_data = {
                "query": query,
                "images": image_data_list,
            }
            with open(f"{query}.json", 'w') as json_file:
                # json.dump streams directly to the file; no intermediate string.
                json.dump(query_image_data, json_file, indent=4)
    finally:
        wd.quit()