# NOTE: the following lines are artifacts from the hosting page this file was
# copied from (site header, runtime banner, file size, commit hash, and a
# line-number ruler). They are not part of the program and are commented out
# so the file parses as Python.
# Spaces:
# Runtime error
# Runtime error
# File size: 5,556 Bytes
# 2519bba
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import requests
import os
import random
import hashlib
import json
# Pool of desktop browser user-agent strings; one is picked at random per run
# (see __main__) so the automated browser session looks like an ordinary client.
user_agents = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.1234.56 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/101.0.1234.56 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/101.0.1234.56",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/14.1.2",
]
def fetch_image_data(query: str, max_links_to_fetch: int, wd, sleep_between_interactions: int = 5):
    """Scrape Google Images for *query* and save up to *max_links_to_fetch* images.

    Images are written to ``images/<query>/<sha1-prefix>.jpg`` and a metadata
    dict is collected for each saved image.

    Parameters
    ----------
    query : str
        Search term inserted into the Google Images URL.
    max_links_to_fetch : int
        Stop after this many images have been saved.
    wd :
        An already-initialized Selenium WebDriver (e.g. ``webdriver.Edge``).
        The driver is navigated but NOT closed by this function.
    sleep_between_interactions : int, default 5
        Seconds to wait after each scroll/click so lazy-loaded results appear.

    Returns
    -------
    list[dict]
        One dict per saved image: keys ``"url"``, ``"title"``, ``"page_url"``,
        ``"Id"`` (10-char sha1 prefix of the image URL, also the file stem).
    """

    def scroll_to_end(wd):
        # Scroll to the bottom so Google lazy-loads the next batch of results.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"
    wd.get(search_url.format(q=query))

    image_data_list = []
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)
        # Get all image thumbnail results
        thumbnail_results = wd.find_elements(By.CLASS_NAME, "Q4LuWd")
        number_results = len(thumbnail_results)
        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        # FIX: the original looped forever when scrolling produced no new
        # thumbnails (fewer results exist than max_links_to_fetch) — bail out
        # instead of spinning.
        if number_results <= results_start:
            print("No new results after scrolling; stopping early.")
            break

        done = False
        for img in thumbnail_results[results_start:number_results]:
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                # Thumbnail may be stale or obscured by an overlay — skip it.
                continue

            # Extract image data: URL, title, and page URL of the preview pane.
            actual_images = wd.find_elements(By.CLASS_NAME, 'pT0Scc')
            for actual_image in actual_images:
                print("ACTUAL IMAGE: ", actual_image)
                src = actual_image.get_attribute('src')  # hoisted: was fetched twice
                if src and 'http' in src:
                    image_url = src
                    try:
                        # FIX: timeout so one unresponsive host cannot hang the
                        # whole scrape; network errors skip just this image.
                        response = requests.get(image_url, timeout=30)
                    except requests.RequestException:
                        continue
                    if response.status_code == 200:
                        image_title = actual_image.get_attribute('alt')
                        # Find the parent <a> tag of the image for the page URL
                        parent_a_tag = actual_image.find_element(By.XPATH, './ancestor::a')
                        image_page_url = parent_a_tag.get_attribute('href')
                        # Create a folder for the specific query if it doesn't exist.
                        # FIX: exist_ok avoids the check-then-create race.
                        query_folder = os.path.join('images', query)
                        os.makedirs(query_folder, exist_ok=True)
                        # Generate a unique file name using the URL hash
                        file_name = hashlib.sha1(image_url.encode()).hexdigest()[:10]
                        file_path = os.path.join(query_folder, f"{file_name}.jpg")
                        # Save the image
                        with open(file_path, 'wb') as f:
                            f.write(response.content)
                        print(f"SUCCESS - saved {image_url} - as {file_path}")
                        # Store the metadata in the list
                        image_data_list.append({
                            "url": image_url,
                            "title": image_title,
                            "page_url": image_page_url,
                            "Id": file_name
                        })
                        image_count += 1  # Increment the image count
                        if image_count >= max_links_to_fetch:
                            print(f"Found: {len(image_data_list)} images, done!")
                            done = True
                            break  # Exit the actual-images loop
            if done:
                break  # Exit the thumbnail loop
        if done:
            break  # Exit the scroll loop
        # Move the result start point further down
        results_start = number_results
    return image_data_list
if __name__ == '__main__':
    # Select a random user agent so the session resembles a normal browser.
    selected_user_agent = random.choice(user_agents)
    # Set the user agent for Edge driver
    options = webdriver.EdgeOptions()
    options.add_argument(f'user-agent={selected_user_agent}')
    # Initialize the Edge driver with the specified user agent
    wd = webdriver.Edge(options=options)
    queries = ["Elon Musk", "Barack Obama", "Taylor Swift", "Bill Gates", "Eminem"]  # change your set of queries here
    try:
        for query in queries:
            num_of_images = 20
            wd.get('https://google.com')
            search_box = wd.find_element(By.NAME, 'q')
            search_box.send_keys(query)
            image_data_list = fetch_image_data(query, num_of_images, wd)
            # Bundle the query with its collected image metadata.
            query_image_data = {
                "query": query,
                "images": image_data_list
            }
            # Save the metadata to a per-query JSON file.
            # FIX: json.dump writes straight to the handle (no intermediate
            # string); explicit UTF-8 + ensure_ascii=False keeps non-ASCII
            # titles readable in the output.
            json_filename = f"{query}.json"
            with open(json_filename, 'w', encoding='utf-8') as json_file:
                json.dump(query_image_data, json_file, indent=4, ensure_ascii=False)
    finally:
        # FIX: the original leaked the browser process if any query raised;
        # always shut the driver down.
        wd.quit()