# genai-reward / count_authors.py
# Author: TheFrenchDemos — Initial commit (9705a2a)
import tqdm
from multiprocessing import Pool, cpu_count
import signal
import sys
import time
from flickrapi import FlickrAPI
# Add Flickr configuration
# SECURITY(review): API key/secret are committed in source control — these
# credentials should be rotated and loaded from the environment instead.
FLICKR_API_KEY = '80ef21a6f7eb0984ea613c316a89ca69'
FLICKR_API_SECRET = '4d0e8ce6734f4b3f'
# Module-level client shared by all lookups. format='parsed-json' makes the
# API return plain dicts; store_token=False avoids writing an OAuth token cache.
flickr = FlickrAPI(FLICKR_API_KEY, FLICKR_API_SECRET, format='parsed-json', store_token=False)
def get_photo_id(url):
    """Extract the Flickr photo ID from an image URL.

    The ID is the part of the last path segment before the first underscore,
    e.g. '.../65535/52123456789_abcd_b.jpg' -> '52123456789'.

    Args:
        url: A Flickr static-image URL (expected to be a str).

    Returns:
        The photo ID as a string, or None when *url* is not a string
        (e.g. None), in which case the caller falls back to placeholders.
    """
    try:
        # Last path component, then everything before the first '_'.
        return url.split('/')[-1].split('_')[0]
    except (AttributeError, TypeError):
        # Narrowed from a bare `except:` — only non-string inputs can fail
        # here; anything else (e.g. KeyboardInterrupt) should propagate.
        return None
def get_other_info(url):
    """Get author information from Flickr for a photo URL.

    Args:
        url: Flickr static-image URL to look up.

    Returns:
        A dict with keys 'username', 'realname', 'nsid', 'flickr_url' and
        'license'. On any failure (unparseable URL, API error, missing
        fields) a placeholder dict with 'Unknown' values is returned —
        lookups are deliberately best-effort so one bad URL never aborts
        the batch.
    """
    try:
        photo_id = get_photo_id(url)
        if photo_id:
            # Throttle to ~10 requests/sec to stay under Flickr rate limits.
            time.sleep(0.1)
            photo_info = flickr.photos.getInfo(photo_id=photo_id)
            # Renamed from `license` to avoid shadowing the builtin.
            license_info = photo_info['photo']['license']
            owner = photo_info['photo']['owner']
            flickr_url = f"https://www.flickr.com/photos/{owner.get('nsid', '')}/{photo_id}"
            return {
                'username': owner.get('username', ''),
                'realname': owner.get('realname', ''),
                'nsid': owner.get('nsid', ''),
                'flickr_url': flickr_url,
                'license': license_info
            }
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; any API/parse error falls through to the
        # placeholder below.
        pass
    return {
        'username': 'Unknown',
        'realname': 'Unknown',
        'nsid': '',
        'flickr_url': '',
        'license': 'Unknown'
    }
def init_worker():
    """Initialize worker process to handle signals.

    Pool workers ignore SIGINT so a Ctrl-C is delivered only to the parent
    process, which then tears the pool down cleanly (see
    process_urls_in_chunks).
    """
    signal.signal(signal.SIGINT, signal.SIG_IGN)
def process_url(url):
    """Resolve author info for one URL inside a pool worker.

    Any exception is converted into an 'Error' placeholder dict so a
    single bad URL never kills the worker or the batch.
    """
    try:
        return get_other_info(url)
    except Exception as exc:
        fallback = {
            'username': 'Error',
            'realname': str(exc),
            'nsid': '',
            'flickr_url': url,
            'license': 'Unknown',
        }
        return fallback
def process_urls_in_chunks(urls, chunk_size=100000):
    """Resolve author info for every URL using a process pool.

    URLs are handled in slices of *chunk_size* so tqdm can report progress
    per chunk. On Ctrl-C the pool is terminated and the process exits with
    status 1.

    Args:
        urls: Sequence of Flickr image URLs.
        chunk_size: Number of URLs handed to the pool per progress bar.

    Returns:
        List of author-info dicts, one per input URL, in input order.
    """
    results = []
    with Pool(cpu_count(), initializer=init_worker) as pool:
        try:
            for chunk_index, start in enumerate(range(0, len(urls), chunk_size), start=1):
                chunk = urls[start:start + chunk_size]
                progress = tqdm.tqdm(
                    pool.imap(process_url, chunk),
                    total=len(chunk),
                    desc=f"Processing chunk {chunk_index}",
                )
                results.extend(progress)
        except KeyboardInterrupt:
            pool.terminate()
            pool.join()
            print("\nProcessing interrupted by user")
            sys.exit(1)
    return results
if __name__ == "__main__":
    urls_file = "data/openimages_urls.txt"
    # Cap at 100k URLs to keep runtime (and Flickr API usage) bounded.
    with open(urls_file) as f:
        urls = [line.strip() for line in f][:100000]
    authors = process_urls_in_chunks(urls)
    # Count distinct authors by username ('Unknown'/'Error' each count once).
    unique_authors = len({author['username'] for author in authors})
    print(f"unique_authors: {unique_authors}")
    print(f"Number of unique authors: {unique_authors}")