from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import requests
import os
import random
import hashlib
import json

# A small pool of desktop user agents to rotate between runs
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.1234.56 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/101.0.1234.56 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/101.0.1234.56",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/14.1.2",
]


def fetch_image_data(query: str, max_links_to_fetch: int, wd: webdriver.Edge, sleep_between_interactions: int = 5):
    """Search Google Images for `query`, download up to `max_links_to_fetch` images,
    and return a list of metadata dictionaries (url, title, page_url, Id)."""

    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"
    wd.get(search_url.format(q=query))

    image_data_list = []
    image_count = 0
    results_start = 0

    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # Get all image thumbnail results
        thumbnail_results = wd.find_elements(By.CLASS_NAME, "Q4LuWd")
        number_results = len(thumbnail_results)

        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        done = False
        for img in thumbnail_results[results_start:number_results]:
            # Click the thumbnail to open the full-size preview
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # Extract image data: URL, title, and the page the image came from
            actual_images = wd.find_elements(By.CLASS_NAME, 'pT0Scc')
            for actual_image in actual_images:
                print("ACTUAL IMAGE: ", actual_image)
                if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'):
                    image_url = actual_image.get_attribute('src')
                    try:
                        response = requests.get(image_url, timeout=10)
                    except requests.RequestException:
                        continue  # Skip images that fail to download
                    if response.status_code == 200:
                        image_title = actual_image.get_attribute('alt')

                        # Find the parent <a> tag of the image for the page URL
                        parent_a_tag = actual_image.find_element(By.XPATH, './ancestor::a')
                        # Get the page URL directly from the parent tag
                        image_page_url = parent_a_tag.get_attribute('href')

                        # Create a folder for the specific query if it doesn't exist
                        query_folder = os.path.join('images', query)
                        os.makedirs(query_folder, exist_ok=True)

                        # Generate a unique file name using the URL hash
                        file_name = hashlib.sha1(image_url.encode()).hexdigest()[:10]
                        # Create a file path with the .jpg extension
                        file_path = os.path.join(query_folder, f"{file_name}.jpg")

                        # Save the image
                        with open(file_path, 'wb') as f:
                            f.write(response.content)
                        print(f"SUCCESS - saved {image_url} - as {file_path}")

                        # Store the metadata in the list
                        image_data_list.append({
                            "url": image_url,
                            "title": image_title,
                            "page_url": image_page_url,
                            "Id": file_name
                        })

                        image_count += 1  # Increment the image count

                        if image_count >= max_links_to_fetch:
                            print(f"Found: {len(image_data_list)} images, done!")
                            done = True
                            break  # Exit the inner loop
            if done:
                break
        if done:
            break

        # Move the result start point further down
        results_start = len(thumbnail_results)

    return image_data_list


if __name__ == '__main__':
    # Select a random user agent
    selected_user_agent = random.choice(user_agents)

    # Set the user agent for the Edge driver
    options = webdriver.EdgeOptions()
    options.add_argument(f'user-agent={selected_user_agent}')

    # Initialize the Edge driver with the specified user agent
    wd = webdriver.Edge(options=options)

    queries = ["Elon Musk", "Barack Obama", "Taylor Swift", "Bill Gates", "Eminem"]  # change your set of queries here

    for query in queries:
        num_of_images = 20

        # fetch_image_data navigates to the Google Images search URL itself
        image_data_list = fetch_image_data(query, num_of_images, wd)

        # Create a dictionary to store the image data
        query_image_data = {
            "query": query,
            "images": image_data_list
        }

        # Serialize the image data dictionary to JSON
        json_data = json.dumps(query_image_data, indent=4)

        # Save the JSON data to a file named after the query
        json_filename = f"{query}.json"
        with open(json_filename, 'w') as json_file:
            json_file.write(json_data)

    wd.quit()