# misinfo/src/data_loader/download_images.py
# Last commit: "update" (54e8a79) by gyigit; file size 5.48 kB.
import argparse
import io
import json
import os
import threading
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests
from PIL import Image
from tqdm import tqdm

from src.utils.data_utils import HEADERS
from src.utils.path_utils import get_project_root
# Constants
# Base path returned by the project's get_project_root() helper; the other
# locations below are derived from it.
PROJECT_ROOT = get_project_root()
# Directory that holds the per-dataset CSV files read by download_images and
# receives the "<dataset>_image_download_stats.json" reports it writes.
EXTRACTION_DIR = str(PROJECT_ROOT / "data/raw/factify/extracted")
# Parent folder for downloaded images; download_images appends the dataset
# name to it to obtain the actual target folder.
IMAGES_DIR = os.path.join(EXTRACTION_DIR, "images")
def ensure_directories(images_folder):
    """Create ``images_folder`` (including missing parents) unless it is already a directory."""
    if os.path.isdir(images_folder):
        # Nothing to do: the target folder is already in place.
        return
    os.makedirs(images_folder, exist_ok=True)
def download_image(url, save_path):
    """Download one image, convert it to RGB and store it at ``save_path``.

    The image is first written to a sibling temporary file and only moved to
    ``save_path`` once it has been saved completely. Because an existing
    ``save_path`` is treated as "already downloaded", a partially written file
    must never be left under that name.

    Args:
        url: Address of the image to fetch.
        save_path: Destination file; its extension selects the image format.

    Returns:
        True if the file already existed or was downloaded and saved,
        False if anything went wrong (the error is printed, not raised).
    """
    # Check if the image already exists
    if os.path.exists(save_path):
        print(f"Image already exists: {save_path}")
        return True

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
        )
    }

    # Keep the original extension on the temporary name so the image library
    # still infers the output format from it.
    stem, extension = os.path.splitext(save_path)
    partial_path = f"{stem}.part{extension}"

    try:
        # The body is read in full via ``response.content``, so streaming is
        # not requested; that also avoids holding the connection open when
        # raise_for_status() fails.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()  # Raise an error for HTTP issues

        img = Image.open(io.BytesIO(response.content))
        img = img.convert("RGB")  # Ensure the image is in RGB format
        img.save(partial_path)
        # Publish the finished file under its final name in a single step.
        os.replace(partial_path, save_path)

        print(f"Downloaded and saved image: {save_path}")
        return True
    except Exception as e:
        # Deliberately broad: one bad URL or undecodable image must not stop
        # the rest of the batch.
        print(f"Failed to download image from {url}: {e}")
        try:
            os.remove(partial_path)
        except OSError:
            # No partial file was created (or it is already gone).
            pass
        return False
# Guards every update of the shared ``stats`` dict: process_image is run from
# several worker threads, and ``counter += 1`` is a read-modify-write that can
# lose updates when interleaved.
_STATS_LOCK = threading.Lock()


def _is_usable_url(value):
    """Return True when ``value`` is a non-blank string (rejects None, NaN, "")."""
    return isinstance(value, str) and bool(value.strip())


def process_image(row, images_folder, stats, dataset_name):
    """Process claim and evidence image downloads for a single row.

    Args:
        row: Mapping-like record supporting ``row["id"]`` and ``row.get(...)``
            for ``category``, ``claim_image`` and ``evidence_image``.
        images_folder: Directory that receives ``<id>_claim.jpg`` and
            ``<id>_evidence.jpg``.
        stats: Shared counters, mutated in place. Must contain
            ``successful_claim``, ``successful_evidence`` and a ``categories``
            mapping of per-category counters.
        dataset_name: Not used by this function; kept for caller compatibility.
    """
    file_id = str(row["id"])
    category = row.get("category", "Unknown")
    image_urls = (
        ("claim", row.get("claim_image", "")),
        ("evidence", row.get("evidence_image", "")),
    )

    with _STATS_LOCK:
        # Ensure category stats exist
        category_stats = stats["categories"].setdefault(
            category,
            {
                "total_claim": 0,
                "successful_claim": 0,
                "total_evidence": 0,
                "successful_evidence": 0,
            },
        )
        category_stats["total_claim"] += 1
        category_stats["total_evidence"] += 1

    for kind, image_url in image_urls:
        # Empty cells can arrive as NaN (a truthy float), so a plain
        # truthiness test is not enough to detect a missing URL.
        if not _is_usable_url(image_url):
            continue
        # The download itself runs outside the lock so workers stay parallel.
        success = download_image(
            image_url, os.path.join(images_folder, f"{file_id}_{kind}.jpg")
        )
        if success:
            with _STATS_LOCK:
                stats[f"successful_{kind}"] += 1
                category_stats[f"successful_{kind}"] += 1
def download_images(dataset, use_threading, max_workers=10):
    """Download images for the specified dataset (train or test).

    Reads ``<EXTRACTION_DIR>/<dataset>.csv`` (tab-separated, first line
    skipped, columns named by ``HEADERS``), processes every row with
    ``process_image`` and writes the collected counters to
    ``<EXTRACTION_DIR>/<dataset>_image_download_stats.json``.

    Args:
        dataset: Dataset name used to build the CSV, image-folder and
            stats-file paths.
        use_threading: When true, rows are processed on a thread pool;
            otherwise they are processed one after another.
        max_workers: Size of the thread pool used when ``use_threading`` is
            true. Defaults to 10.

    Returns:
        None. If the CSV file does not exist, a message is printed and the
        function returns without writing a stats file.
    """
    csv_path = os.path.join(EXTRACTION_DIR, f"{dataset}.csv")
    images_folder = os.path.join(IMAGES_DIR, dataset)
    stats_file_path = os.path.join(
        EXTRACTION_DIR, f"{dataset}_image_download_stats.json"
    )

    ensure_directories(images_folder)

    if not os.path.exists(csv_path):
        print(f"CSV file not found for {dataset}: {csv_path}")
        return

    stats = {
        "successful_claim": 0,
        "successful_evidence": 0,
        "categories": defaultdict(
            lambda: {
                "total_claim": 0,
                "successful_claim": 0,
                "total_evidence": 0,
                "successful_evidence": 0,
            }
        ),
    }

    df = pd.read_csv(csv_path, names=HEADERS, header=None, sep="\t", skiprows=1)
    progress_label = f"Downloading {dataset} images"

    if use_threading:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(process_image, row, images_folder, stats, dataset)
                for _, row in df.iterrows()
            ]
            for future in tqdm(
                as_completed(futures),
                total=len(futures),
                desc=progress_label,
            ):
                # An exception raised inside a worker is stored on its future
                # and would vanish unless it is inspected here.
                error = future.exception()
                if error is not None:
                    print(f"Failed to process a row of {dataset}: {error!r}")
    else:
        for _, row in tqdm(df.iterrows(), total=len(df), desc=progress_label):
            process_image(row, images_folder, stats, dataset)

    with open(stats_file_path, "w") as stats_file:
        json.dump(stats, stats_file, indent=4)

    print(f"Image download stats saved to {stats_file_path}")
def main(argv=None):
    """Parse command-line options and download images for the chosen dataset(s).

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads the arguments from ``sys.argv``.

    Options:
        --dataset {train,test}: restrict the run to one dataset; without it
            both ``train`` and ``test`` are processed.
        --use-threading / --no-threading: turn threaded downloads on
            (default) or off.
    """
    parser = argparse.ArgumentParser(description="Download images for Factify dataset.")
    parser.add_argument(
        "--dataset",
        choices=["train", "test"],
        help="Specify which dataset to download images for (train or test). If not specified, both will be downloaded.",
    )
    parser.add_argument(
        "--use-threading",
        dest="use_threading",
        action="store_true",
        default=True,
        help="Enable threading for image downloads (default: True).",
    )
    # With ``store_true`` and ``default=True`` alone the flag could never be
    # switched off; this companion option writes False to the same destination.
    parser.add_argument(
        "--no-threading",
        dest="use_threading",
        action="store_false",
        help="Disable threading and download images sequentially.",
    )
    args = parser.parse_args(argv)

    if args.dataset:
        # Run for the specified dataset
        datasets = [args.dataset]
    else:
        # Run for both train and test if no dataset is specified
        print("No dataset specified. Downloading images for both train and test...")
        datasets = ["train", "test"]

    for dataset in datasets:
        download_images(dataset, args.use_threading)


if __name__ == "__main__":
    main()