# genai-reward / count_authors.py
# Author: TheFrenchDemos — Initial commit (9705a2a)
import tqdm
from multiprocessing import Pool, cpu_count
import signal
import sys
import time
from flickrapi import FlickrAPI
# Add Flickr configuration
# SECURITY(review): API key/secret are committed in source control — these
# credentials should be rotated and loaded from the environment instead.
FLICKR_API_KEY = '80ef21a6f7eb0984ea613c316a89ca69'
FLICKR_API_SECRET = '4d0e8ce6734f4b3f'
# Module-level client shared by all lookups. format='parsed-json' makes the
# API return plain dicts; store_token=False avoids writing an OAuth token cache.
flickr = FlickrAPI(FLICKR_API_KEY, FLICKR_API_SECRET, format='parsed-json', store_token=False)
def get_photo_id(url):
    """Extract the Flickr photo ID from an image URL.

    The ID is the part of the last path segment before the first underscore,
    e.g. '.../65535/52123456789_abcd_b.jpg' -> '52123456789'.

    Args:
        url: A Flickr static-image URL (expected to be a str).

    Returns:
        The photo ID as a string, or None when *url* is not a string
        (e.g. None), in which case the caller falls back to placeholders.
    """
    try:
        # Last path component, then everything before the first '_'.
        return url.split('/')[-1].split('_')[0]
    except (AttributeError, TypeError):
        # Narrowed from a bare `except:` — only non-string inputs can fail
        # here; anything else (e.g. KeyboardInterrupt) should propagate.
        return None
def get_other_info(url):
    """Get author information from Flickr for a photo URL.

    Args:
        url: Flickr static-image URL to look up.

    Returns:
        A dict with keys 'username', 'realname', 'nsid', 'flickr_url' and
        'license'. On any failure (unparseable URL, API error, missing
        fields) a placeholder dict with 'Unknown' values is returned —
        lookups are deliberately best-effort so one bad URL never aborts
        the batch.
    """
    try:
        photo_id = get_photo_id(url)
        if photo_id:
            # Throttle to ~10 requests/sec to stay under Flickr rate limits.
            time.sleep(0.1)
            photo_info = flickr.photos.getInfo(photo_id=photo_id)
            # Renamed from `license` to avoid shadowing the builtin.
            license_info = photo_info['photo']['license']
            owner = photo_info['photo']['owner']
            flickr_url = f"https://www.flickr.com/photos/{owner.get('nsid', '')}/{photo_id}"
            return {
                'username': owner.get('username', ''),
                'realname': owner.get('realname', ''),
                'nsid': owner.get('nsid', ''),
                'flickr_url': flickr_url,
                'license': license_info
            }
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; any API/parse error falls through to the
        # placeholder below.
        pass
    return {
        'username': 'Unknown',
        'realname': 'Unknown',
        'nsid': '',
        'flickr_url': '',
        'license': 'Unknown'
    }
def init_worker():
    """Initialize worker process to handle signals.

    Pool workers ignore SIGINT so a Ctrl-C is delivered only to the parent
    process, which then tears the pool down cleanly (see
    process_urls_in_chunks).
    """
    signal.signal(signal.SIGINT, signal.SIG_IGN)
def process_url(url):
    """Resolve author info for one URL inside a pool worker.

    Any exception is converted into an 'Error' placeholder dict so a
    single bad URL never kills the worker or the batch.
    """
    try:
        return get_other_info(url)
    except Exception as exc:
        fallback = {
            'username': 'Error',
            'realname': str(exc),
            'nsid': '',
            'flickr_url': url,
            'license': 'Unknown',
        }
        return fallback
def process_urls_in_chunks(urls, chunk_size=100000):
    """Resolve author info for every URL using a process pool.

    URLs are handled in slices of *chunk_size* so tqdm can report progress
    per chunk. On Ctrl-C the pool is terminated and the process exits with
    status 1.

    Args:
        urls: Sequence of Flickr image URLs.
        chunk_size: Number of URLs handed to the pool per progress bar.

    Returns:
        List of author-info dicts, one per input URL, in input order.
    """
    results = []
    with Pool(cpu_count(), initializer=init_worker) as pool:
        try:
            for chunk_index, start in enumerate(range(0, len(urls), chunk_size), start=1):
                chunk = urls[start:start + chunk_size]
                progress = tqdm.tqdm(
                    pool.imap(process_url, chunk),
                    total=len(chunk),
                    desc=f"Processing chunk {chunk_index}",
                )
                results.extend(progress)
        except KeyboardInterrupt:
            pool.terminate()
            pool.join()
            print("\nProcessing interrupted by user")
            sys.exit(1)
    return results
if __name__ == "__main__":
    urls_file = "data/openimages_urls.txt"
    # Cap at 100k URLs to keep runtime (and Flickr API usage) bounded.
    with open(urls_file) as f:
        urls = [line.strip() for line in f][:100000]
    authors = process_urls_in_chunks(urls)
    # Count distinct authors by username ('Unknown'/'Error' each count once).
    unique_authors = len({author['username'] for author in authors})
    print(f"unique_authors: {unique_authors}")
    print(f"Number of unique authors: {unique_authors}")