Spaces:
Running
Running
import os
import signal
import sys
import time
from multiprocessing import Pool, cpu_count

import tqdm
from flickrapi import FlickrAPI
# Flickr API configuration.
# SECURITY: credentials should not be committed to source control. They are
# read from the FLICKR_API_KEY / FLICKR_API_SECRET environment variables
# first; the literals below remain only as a backward-compatible fallback and
# should be rotated and removed.
FLICKR_API_KEY = os.environ.get('FLICKR_API_KEY', '80ef21a6f7eb0984ea613c316a89ca69')
FLICKR_API_SECRET = os.environ.get('FLICKR_API_SECRET', '4d0e8ce6734f4b3f')
# Module-level client shared by the lookup helpers; configured to return
# parsed JSON and not to store an auth token.
flickr = FlickrAPI(FLICKR_API_KEY, FLICKR_API_SECRET, format='parsed-json', store_token=False)
def get_photo_id(url):
    """Extract the photo ID from a Flickr image URL.

    The ID is the part of the last ``/``-separated segment that precedes the
    first underscore, e.g. ``".../3456789_abcdef_o.jpg"`` -> ``"3456789"``.

    Args:
        url: URL string to parse.

    Returns:
        The extracted ID string (possibly empty when the last segment is
        empty or starts with an underscore), or ``None`` when ``url`` is not
        a string.
    """
    try:
        last_segment = url.rsplit('/', 1)[-1]
        return last_segment.split('_', 1)[0]
    except (AttributeError, TypeError):
        # Non-string input (None, numbers, bytes, ...) carries no usable ID.
        # Only these errors are caught so that KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        return None
def get_other_info(url):
    """Look up author and license details for a Flickr photo URL.

    The photo ID is extracted from ``url`` and passed to
    ``flickr.photos.getInfo``. This is a best-effort lookup: any failure
    (no ID, API error, unexpected response shape) yields a placeholder
    record instead of raising.

    Args:
        url: Flickr image URL.

    Returns:
        A dict with the keys ``username``, ``realname``, ``nsid``,
        ``flickr_url`` and ``license``. On failure the text fields are
        ``'Unknown'`` and ``nsid`` / ``flickr_url`` are empty strings.
    """
    fallback = {
        'username': 'Unknown',
        'realname': 'Unknown',
        'nsid': '',
        'flickr_url': '',
        'license': 'Unknown'
    }
    try:
        photo_id = get_photo_id(url)
        if not photo_id:
            return fallback
        # Pause 0.1 s before every API call (presumably to throttle the
        # request rate).
        time.sleep(0.1)
        photo = flickr.photos.getInfo(photo_id=photo_id)['photo']
        # Named photo_license so the ``license`` builtin is not shadowed.
        photo_license = photo['license']
        owner = photo['owner']
        nsid = owner.get('nsid', '')
        return {
            'username': owner.get('username', ''),
            'realname': owner.get('realname', ''),
            'nsid': nsid,
            'flickr_url': f"https://www.flickr.com/photos/{nsid}/{photo_id}",
            'license': photo_license
        }
    except Exception:
        # Deliberate best-effort: one failing photo must not abort the run.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return fallback
def init_worker():
    """Make the calling process ignore SIGINT.

    Passed as the ``initializer`` of the worker pool so that the SIGINT
    handler of each worker is set to ``SIG_IGN``.
    """
    ignore_handler = signal.SIG_IGN
    signal.signal(signal.SIGINT, ignore_handler)
def process_url(url):
    """Return the author record for ``url``, never raising an ``Exception``.

    Delegates to ``get_other_info``. If that call raises, an error record is
    returned instead: ``username`` is ``'Error'``, ``realname`` holds the
    exception text and ``flickr_url`` holds the input URL.
    """
    try:
        record = get_other_info(url)
    except Exception as err:
        record = {
            'username': 'Error',
            'realname': str(err),
            'nsid': '',
            'flickr_url': url,
            'license': 'Unknown'
        }
    return record
def process_urls_in_chunks(urls, chunk_size=100000):
    """Run ``process_url`` over ``urls`` in a process pool, chunk by chunk.

    A pool with ``cpu_count()`` workers (each initialised by ``init_worker``)
    handles one slice of ``chunk_size`` URLs at a time, with a tqdm progress
    bar per slice. Results keep the input order.

    Args:
        urls: Sliceable sequence of URLs.
        chunk_size: Number of URLs per progress-bar chunk.

    Returns:
        List of the records returned by ``process_url``, one per URL.

    On ``KeyboardInterrupt`` the pool is terminated and joined, a message is
    printed, and the process exits with status 1.
    """
    collected = []
    with Pool(cpu_count(), initializer=init_worker) as worker_pool:
        try:
            # Walk the input one slice at a time.
            for start in range(0, len(urls), chunk_size):
                batch = urls[start:start + chunk_size]
                progress = tqdm.tqdm(
                    worker_pool.imap(process_url, batch),
                    total=len(batch),
                    desc=f"Processing chunk {start//chunk_size + 1}"
                )
                # Drain the whole progress iterator before storing results.
                collected += list(progress)
        except KeyboardInterrupt:
            worker_pool.terminate()
            worker_pool.join()
            print("\nProcessing interrupted by user")
            sys.exit(1)
    return collected
if __name__ == "__main__":
    # Script entry point: read up to the first 100000 URLs from the list
    # file, look up their authors, and report how many distinct usernames
    # were seen.
    urls_file = "data/openimages_urls.txt"
    with open(urls_file) as f:
        raw_lines = f.readlines()
    urls = [line.strip() for line in raw_lines[:100000]]
    authors = process_urls_in_chunks(urls)
    # Count unique authors by username
    usernames = {record['username'] for record in authors}
    unique_authors = len(usernames)
    print(f"unique_authors: {unique_authors}")
    print(f"Number of unique authors: {unique_authors}")