import pandas as pd
import requests
import os

# Path of the tab-separated manifest to load.
file_path = "open-images-dataset-train0.tsv"

# Parse the manifest: the first line of the file is skipped and the three
# columns are labelled explicitly, since no header row is read from the file.
df = pd.read_csv(
    file_path,
    sep="\t",
    engine="python",
    skiprows=1,
    names=["ImageURL", "Subset", "ImageID"],
)

# Preview the first rows so the parsed columns can be checked by eye.
print("First few rows of the cleaned dataset:", df.head(), sep="\n")

# All images are stored in one fixed folder (since 'Subset' contains numbers,
# not real categories, it is not used to build per-category folders).
output_folder = "open_images_v7/dataset"
os.makedirs(output_folder, exist_ok=True)

# Only the first 100 rows are attempted; a failed download still counts
# towards this limit.
max_images = 100

# head() limits by row position, so the cap holds regardless of how the
# frame is indexed (comparing index labels only works for a 0..n-1 index).
for row in df.head(max_images).itertuples(index=False):
    image_url = row.ImageURL
    image_id = row.ImageID

    # The ID is used as a file name. Replace path separators so an ID such as
    # "ab/cd" cannot point into a non-existent sub-directory and make the
    # write fail. NOTE(review): presumably IDs may be base64-like and contain
    # "/" - confirm against the manifest.
    safe_id = str(image_id)
    for separator in (os.sep, os.altsep):
        if separator:
            safe_id = safe_id.replace(separator, "_")

    # Ensure the image filename ends with ".jpg"
    file_name = f"{safe_id}.jpg"
    image_path = os.path.join(output_folder, file_name)

    try:
        response = requests.get(image_url, timeout=10)
        if response.status_code == 200:
            with open(image_path, "wb") as f:
                f.write(response.content)
            print(f"✅ Downloaded: {file_name}")
        else:
            print(f"❌ Failed: {image_id}")
    except (requests.RequestException, OSError) as e:
        # Best effort: a network error or a failed write skips this image and
        # the loop continues. Other exception types indicate a programming
        # error and are deliberately not swallowed.
        print(f"❌ Error downloading {image_id}: {e}")