Spaces:

mgyigit
/

misinfo

Sleeping

App Files Community

misinfo/src/data_loader/download_images.py

gyigit

update

54e8a79 3 months ago

raw

history blame

5.48 kB

	import argparse
	import io
	import json
	import os
	import threading
	from collections import defaultdict
	from concurrent.futures import ThreadPoolExecutor, as_completed

	import pandas as pd
	import requests
	from PIL import Image
	from tqdm import tqdm

	from src.utils.data_utils import HEADERS
	from src.utils.path_utils import get_project_root

	# Constants
	# Filesystem locations used by this script; every path is derived from the
	# value returned by get_project_root().
	PROJECT_ROOT = get_project_root()
	# Folder holding the extracted Factify files: download_images() reads
	# "<dataset>.csv" from here and writes its JSON stats file here.
	EXTRACTION_DIR = str(PROJECT_ROOT / "data/raw/factify/extracted")
	# Parent folder for downloaded images; download_images() uses one
	# sub-folder per dataset beneath it.
	IMAGES_DIR = os.path.join(EXTRACTION_DIR, "images")


	def ensure_directories(images_folder):
	    """Create ``images_folder``, including missing parents, if it is absent.

	    Calling this for a directory that already exists does nothing.
	    """
	    os.makedirs(name=images_folder, exist_ok=True)


	def download_image(url, save_path):
	    """Download one image and store it at ``save_path`` in RGB mode.

	    An existing file at ``save_path`` is treated as already downloaded and no
	    request is made. Otherwise the image is fetched, decoded with Pillow,
	    converted to RGB, written to a temporary sibling file and then moved into
	    place, so a failed or interrupted save cannot leave a truncated file that
	    a later run would mistake for a finished download.

	    Args:
	        url: Address of the image to fetch.
	        save_path: Destination file; Pillow infers the output format from its
	            extension.

	    Returns:
	        True if the file already existed or was saved successfully, False if
	        anything went wrong (the error is printed, never raised).
	    """
	    # Skip the network round trip when the file is already on disk.
	    if os.path.exists(save_path):
	        print(f"Image already exists: {save_path}")
	        return True

	    headers = {
	        "User-Agent": (
	            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) "
	            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
	        )
	    }
	    # Keep the real extension at the end of the temporary name so Pillow
	    # still picks the same output format as it would for ``save_path``.
	    stem, extension = os.path.splitext(save_path)
	    partial_path = f"{stem}.part{extension}"
	    try:
	        # With stream=True the connection stays checked out until the body is
	        # consumed or the response is closed; the with-block guarantees the
	        # close even when raise_for_status() fails before the body is read.
	        with requests.get(
	            url, headers=headers, stream=True, timeout=30
	        ) as response:
	            response.raise_for_status()  # Raise an error for HTTP issues
	            payload = response.content
	        img = Image.open(io.BytesIO(payload)).convert("RGB")
	        img.save(partial_path)
	        # Publish the finished file in a single step.
	        os.replace(partial_path, save_path)
	    except Exception as e:
	        # Deliberately broad: one bad URL or undecodable image must not stop
	        # the rest of the batch.
	        print(f"Failed to download image from {url}: {e}")
	        return False
	    finally:
	        # Remove a leftover partial file; after a successful replace there is
	        # nothing to delete, which is expected and ignored.
	        try:
	            os.remove(partial_path)
	        except OSError:
	            pass

	    print(f"Downloaded and saved image: {save_path}")
	    return True


	# Guards the shared ``stats`` dict: download_images() submits process_image()
	# to a thread pool, so several calls may update the counters at once.
	_STATS_LOCK = threading.Lock()


	def _is_usable_url(value):
	    """Return True only for non-blank strings (rejects None, NaN and "")."""
	    return isinstance(value, str) and bool(value.strip())


	def process_image(row, images_folder, stats, dataset_name):
	    """Download the claim and evidence images of one row and update ``stats``.

	    Args:
	        row: Mapping-like record providing ``row["id"]`` and ``row.get`` for
	            the "category", "claim_image" and "evidence_image" fields.
	        images_folder: Directory that receives "<id>_claim.jpg" and
	            "<id>_evidence.jpg".
	        stats: Shared dict with the top-level counters "successful_claim" and
	            "successful_evidence" and a "categories" mapping of per-category
	            counters; it is modified in place.
	        dataset_name: Unused; kept so existing callers keep working.

	    The per-category "total_*" counters are incremented for every row, whether
	    or not the row carries a usable URL. A URL is attempted only when it is a
	    non-blank string, because a missing cell may arrive as NaN, which is
	    truthy and would otherwise be handed to the downloader.
	    """
	    file_id = str(row["id"])
	    category = row.get("category", "Unknown")

	    with _STATS_LOCK:
	        # Ensure category stats exist
	        bucket = stats["categories"].setdefault(
	            category,
	            {
	                "total_claim": 0,
	                "successful_claim": 0,
	                "total_evidence": 0,
	                "successful_evidence": 0,
	            },
	        )
	        bucket["total_claim"] += 1
	        bucket["total_evidence"] += 1

	    for kind in ("claim", "evidence"):
	        image_url = row.get(f"{kind}_image", "")
	        if not _is_usable_url(image_url):
	            continue
	        target = os.path.join(images_folder, f"{file_id}_{kind}.jpg")
	        # The slow download runs outside the lock; only the counter update is
	        # serialized.
	        if download_image(image_url, target):
	            with _STATS_LOCK:
	                stats[f"successful_{kind}"] += 1
	                bucket[f"successful_{kind}"] += 1


	def download_images(dataset, use_threading):
	    """Download images for the specified dataset (train or test).

	    Reads "<dataset>.csv" from EXTRACTION_DIR (tab-separated, first line
	    skipped, columns named by HEADERS), runs process_image() on every row and
	    writes the collected counters to "<dataset>_image_download_stats.json" in
	    EXTRACTION_DIR.

	    Args:
	        dataset: Dataset name used to build the CSV, image-folder and stats
	            file names.
	        use_threading: When true, rows are processed by a pool of 10 worker
	            threads; otherwise they are processed one after another.

	    Returns:
	        None. If the CSV file is missing, a message is printed and nothing is
	        downloaded.
	    """
	    csv_path = os.path.join(EXTRACTION_DIR, f"{dataset}.csv")
	    images_folder = os.path.join(IMAGES_DIR, dataset)
	    stats_file_path = os.path.join(
	        EXTRACTION_DIR, f"{dataset}_image_download_stats.json"
	    )
	    ensure_directories(images_folder)

	    if not os.path.exists(csv_path):
	        print(f"CSV file not found for {dataset}: {csv_path}")
	        return

	    stats = {
	        "successful_claim": 0,
	        "successful_evidence": 0,
	        "categories": defaultdict(
	            lambda: {
	                "total_claim": 0,
	                "successful_claim": 0,
	                "total_evidence": 0,
	                "successful_evidence": 0,
	            }
	        ),
	    }

	    df = pd.read_csv(csv_path, names=HEADERS, header=None, sep="\t", skiprows=1)
	    progress_label = f"Downloading {dataset} images"

	    if use_threading:
	        with ThreadPoolExecutor(max_workers=10) as executor:
	            futures = [
	                executor.submit(process_image, row, images_folder, stats, dataset)
	                for _, row in df.iterrows()
	            ]
	            for future in tqdm(
	                as_completed(futures),
	                total=len(futures),
	                desc=progress_label,
	            ):
	                # An exception raised inside a worker is stored on its future
	                # and would vanish unless it is inspected; report it and keep
	                # going so one bad row does not abort the batch.
	                error = future.exception()
	                if error is not None:
	                    print(f"Failed to process a {dataset} row: {error}")
	    else:
	        for _, row in tqdm(df.iterrows(), total=len(df), desc=progress_label):
	            process_image(row, images_folder, stats, dataset)

	    with open(stats_file_path, "w") as stats_file:
	        json.dump(stats, stats_file, indent=4)
	    print(f"Image download stats saved to {stats_file_path}")


	def main(argv=None):
	    """Parse command-line options and download images for the chosen dataset(s).

	    Args:
	        argv: Optional list of argument strings. ``None`` (the default) makes
	            argparse read the process command line.

	    With ``--dataset`` only that dataset is handled; without it, "train" and
	    then "test" are downloaded. Threading is enabled unless ``--no-threading``
	    is given.
	    """
	    parser = argparse.ArgumentParser(description="Download images for Factify dataset.")
	    parser.add_argument(
	        "--dataset",
	        choices=["train", "test"],
	        help="Specify which dataset to download images for (train or test). If not specified, both will be downloaded.",
	    )
	    # "--use-threading" is a store_true flag whose default is already True, so
	    # on its own it can never switch threading off. It is kept for existing
	    # command lines; "--no-threading" writes False to the same destination.
	    parser.add_argument(
	        "--use-threading",
	        dest="use_threading",
	        action="store_true",
	        default=True,
	        help="Enable threading for image downloads (default: True).",
	    )
	    parser.add_argument(
	        "--no-threading",
	        dest="use_threading",
	        action="store_false",
	        help="Disable threading and download images sequentially.",
	    )
	    args = parser.parse_args(argv)

	    if args.dataset:
	        # Run for the specified dataset
	        datasets = [args.dataset]
	    else:
	        # Run for both train and test if no dataset is specified
	        print("No dataset specified. Downloading images for both train and test...")
	        datasets = ["train", "test"]

	    for dataset in datasets:
	        download_images(dataset, args.use_threading)


	# Run the command-line entry point only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == "__main__":
	main()