from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import requests
import os
import random
import hashlib
import json

# Pool of user agents; one is chosen at random for each driver session
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.1234.56 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/101.0.1234.56 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/101.0.1234.56",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/14.1.2",
]

def fetch_image_data(query: str, max_links_to_fetch: int, wd: webdriver.Edge, sleep_between_interactions: int = 5):
    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"
    wd.get(search_url.format(q=query))

    image_data_list = []
    image_count = 0
    results_start = 0

    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # Get all image thumbnail results
        thumbnail_results = wd.find_elements(By.CLASS_NAME, "Q4LuWd")
        number_results = len(thumbnail_results)
        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        done = False
        for img in thumbnail_results[results_start:number_results]:
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # Extract image data: URL, title, and source page URL
            actual_images = wd.find_elements(By.CLASS_NAME, 'pT0Scc')
            for actual_image in actual_images:
                print("ACTUAL IMAGE: ", actual_image)
                if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'):
                    image_url = actual_image.get_attribute('src')
                    response = requests.get(image_url)
                    if response.status_code == 200:
                        image_title = actual_image.get_attribute('alt')
                        # Find the parent <a> tag of the image for the page URL
                        parent_a_tag = actual_image.find_element(By.XPATH, './ancestor::a')
                        # Get the page URL directly from the parent <a> tag
                        image_page_url = parent_a_tag.get_attribute('href')
                        # Create a folder for the specific query if it doesn't exist
                        query_folder = os.path.join('images', query)
                        if not os.path.exists(query_folder):
                            os.makedirs(query_folder)
                        # Generate a unique file name using the URL hash
                        file_name = hashlib.sha1(image_url.encode()).hexdigest()[:10]
                        # Create a file path with the .jpg extension
                        file_path = os.path.join(query_folder, f"{file_name}.jpg")
                        # Save the image
                        with open(file_path, 'wb') as f:
                            f.write(response.content)
                        print(f"SUCCESS - saved {image_url} - as {file_path}")
                        # Store the metadata in the list
                        image_data_list.append({
                            "url": image_url,
                            "title": image_title,
                            "page_url": image_page_url,
                            "Id": file_name
                        })
                        image_count += 1  # Increment the image count
                        if image_count >= max_links_to_fetch:
                            print(f"Found: {len(image_data_list)} images, done!")
                            done = True
                            break  # Exit the loop
            if done:
                break
        if done:
            break

        # Move the result start point further down
        results_start = len(thumbnail_results)

    return image_data_list

if __name__ == '__main__':
    # Select a random user agent
    selected_user_agent = random.choice(user_agents)

    # Set the user agent for the Edge driver
    options = webdriver.EdgeOptions()
    options.add_argument(f'user-agent={selected_user_agent}')

    # Initialize the Edge driver with the specified user agent
    wd = webdriver.Edge(options=options)

    queries = ["Elon Musk", "Barack Obama", "Taylor Swift", "Bill Gates", "Eminem"]  # change your set of queries here

    for query in queries:
        num_of_images = 20

        wd.get('https://google.com')
        search_box = wd.find_element(By.NAME, 'q')
        search_box.send_keys(query)

        image_data_list = fetch_image_data(query, num_of_images, wd)

        # Create a dictionary to store the image data
        query_image_data = {
            "query": query,
            "images": image_data_list
        }

        # Serialize the image data dictionary to JSON
        json_data = json.dumps(query_image_data, indent=4)

        # Save the JSON data to a file named after the query
        json_filename = f"{query}.json"
        with open(json_filename, 'w') as json_file:
            json_file.write(json_data)

    # Close the browser once all queries have been processed
    wd.quit()
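
# Note: each query produces a JSON metadata file next to the script
# (e.g. "Elon Musk.json" for the first query above). A quick way to
# inspect one after a run, assuming the run completed for that query:
#
#     with open("Elon Musk.json") as f:
#         data = json.load(f)
#     print(data["query"], len(data["images"]))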