import os import requests from bs4 import BeautifulSoup from urllib.parse import urljoin from urllib.request import urlretrieve from tqdm.auto import tqdm from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.by import By import time # Function to fetch and parse the webpage def fetch_images(url, cls_name): options = Options() options.add_argument("--headless") options.add_argument("window-size=1400,1500") options.add_argument("--disable-gpu") options.add_argument("--no-sandbox") options.add_argument("start-maximized") options.add_argument("enable-automation") options.add_argument("--disable-infobars") options.add_argument("--disable-dev-shm-usage") user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" options.add_argument(f"user-agent={user_agent}") driver = webdriver.Chrome(options=options) driver.get(url) time.sleep(5) img_elements = driver.find_elements(By.CLASS_NAME, cls_name) urls = [] for img in img_elements: src_url = img.get_attribute('src') urls.append(src_url) return urls # Function to download images def download_images(img_urls, download_folder='downloaded_images', desc=""): if not os.path.exists(download_folder): os.makedirs(download_folder) for img_url in tqdm(img_urls, desc=desc): fname = img_url.split('/')[-1] fname = fname.split('?')[0] filename = os.path.join(download_folder, fname) urlretrieve(img_url, filename) print(f"Downloaded: {filename}") # Main function to coordinate the image crawling def main(): page_n=0 while True: page_n += 1 url = f'https://www.freepik.com/search?ai=only&format=search&last_filter=page&last_value={page_n}&page={page_n}&query=people%2C+political' url = f'https://app.all-images.ai/en/f/images/search?pageIndex={page_n}&pageSize=40&search=man' cls_name = "_1286nb17" download_path = "/truemedia-eval/crawled-fakes/images/fakes" try: img_urls = fetch_images(url, cls_name) if len(img_urls) == 0: break download_images(img_urls, download_path, desc=f"Page {page_n}") except: break if __name__ == "__main__": main()