import signal
import sys
import time
from multiprocessing import Pool, cpu_count

import tqdm
from flickrapi import FlickrAPI

# Flickr API configuration
FLICKR_API_KEY = '80ef21a6f7eb0984ea613c316a89ca69'
FLICKR_API_SECRET = '4d0e8ce6734f4b3f'
flickr = FlickrAPI(FLICKR_API_KEY, FLICKR_API_SECRET, format='parsed-json', store_token=False)


def get_photo_id(url):
    """Extract the photo ID from a Flickr URL."""
    try:
        return url.split('/')[-1].split('_')[0]
    except Exception:
        return None


def get_other_info(url):
    """Fetch author and license information for a Flickr photo URL."""
    try:
        photo_id = get_photo_id(url)
        if photo_id:
            # Wait 0.1 s before the request to avoid hammering the Flickr API.
            time.sleep(0.1)
            photo_info = flickr.photos.getInfo(photo_id=photo_id)
            license_info = photo_info['photo']['license']
            owner = photo_info['photo']['owner']
            flickr_url = f"https://www.flickr.com/photos/{owner.get('nsid', '')}/{photo_id}"
            return {
                'username': owner.get('username', ''),
                'realname': owner.get('realname', ''),
                'nsid': owner.get('nsid', ''),
                'flickr_url': flickr_url,
                'license': license_info,
            }
    except Exception:
        pass
    return {
        'username': 'Unknown',
        'realname': 'Unknown',
        'nsid': '',
        'flickr_url': '',
        'license': 'Unknown',
    }


def init_worker():
    """Ignore SIGINT in worker processes so the parent handles Ctrl-C."""
    signal.signal(signal.SIGINT, signal.SIG_IGN)


def process_url(url):
    """Wrap get_other_info so failures are returned as error records."""
    try:
        return get_other_info(url)
    except Exception as e:
        return {
            'username': 'Error',
            'realname': str(e),
            'nsid': '',
            'flickr_url': url,
            'license': 'Unknown',
        }


def process_urls_in_chunks(urls, chunk_size=100000):
    """Look up author information for all URLs, processed chunk by chunk."""
    authors = []
    with Pool(cpu_count(), initializer=init_worker) as pool:
        try:
            # Process URLs in chunks so progress is reported per chunk.
            for i in range(0, len(urls), chunk_size):
                chunk = urls[i:i + chunk_size]
                chunk_results = list(tqdm.tqdm(
                    pool.imap(process_url, chunk),
                    total=len(chunk),
                    desc=f"Processing chunk {i // chunk_size + 1}"
                ))
                authors.extend(chunk_results)
        except KeyboardInterrupt:
            pool.terminate()
            pool.join()
            print("\nProcessing interrupted by user")
            sys.exit(1)
    return authors


if __name__ == "__main__":
    urls_file = "data/openimages_urls.txt"
    with open(urls_file) as f:
        # Only the first 100,000 URLs are processed.
        urls = [url.strip() for url in f][:100000]

    authors = process_urls_in_chunks(urls)

    # Count unique authors by username
    unique_authors = len(set(author['username'] for author in authors))
    print(f"Number of unique authors: {unique_authors}")
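

# Optional sketch (not part of the original script): the collected attribution
# records are currently discarded after the count is printed. If they need to
# be persisted, a helper along these lines could be defined above the
# __main__ block and called after process_urls_in_chunks(urls). The output
# path below is an assumed placeholder.
def save_authors(authors, path="data/openimages_authors.json"):
    """Write the per-URL author records to a JSON file."""
    import json
    with open(path, "w") as f:
        json.dump(authors, f, indent=2)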