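"""app.py – Sports Cards Dataset Processor (ai-card-hub).

Downloads card images from a Google Drive folder, renames them according to a
user-supplied naming convention, validates them, and appends them to the
GotThatData/sports-cards dataset on the Hugging Face Hub through a small
Gradio interface.
"""
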
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
import os
import gradio as gr
from datasets import load_dataset, Dataset, concatenate_datasets
import pandas as pd
from PIL import Image
from tqdm import tqdm
import logging
import yaml
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Load settings
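# Illustrative settings.yaml layout (only the 'client_secrets_file' key is read below;
# the filename shown is an assumption):
#   client_secrets_file: client_secrets.json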
if not os.path.exists("settings.yaml"):
    raise FileNotFoundError("settings.yaml file is missing. Please add it with 'client_secrets_file'.")

with open('settings.yaml', 'r') as file:
    settings = yaml.safe_load(file)

# Utility Functions
def safe_load_dataset(dataset_name):
    """Load Hugging Face dataset safely."""
    try:
        dataset = load_dataset(dataset_name)
        return dataset, (len(dataset['train']) if 'train' in dataset else 0)
    except Exception:
        logger.info("No existing dataset found. Starting fresh.")
        return None, 0

def is_valid_image(file_path):
    """Check if a file is a valid image."""
    try:
        with Image.open(file_path) as img:
            img.verify()
        return True
    except Exception:
        return False

# DatasetManager Class
class DatasetManager:
    def __init__(self, local_images_dir="downloaded_cards"):
        self.local_images_dir = local_images_dir
        self.drive = None
        self.dataset_name = "GotThatData/sports-cards"
        os.makedirs(local_images_dir, exist_ok=True)

    def authenticate_drive(self):
        """Authenticate with Google Drive."""
        try:
            gauth = GoogleAuth()
            gauth.settings['client_config_file'] = settings['client_secrets_file']
            # Try to load saved credentials
            gauth.LoadCredentialsFile("credentials.txt")
            if gauth.credentials is None:
                gauth.LocalWebserverAuth()
            elif gauth.access_token_expired:
                gauth.Refresh()
            else:
                gauth.Authorize()
            gauth.SaveCredentialsFile("credentials.txt")
            self.drive = GoogleDrive(gauth)
            return True, "Successfully authenticated with Google Drive"
        except Exception as e:
            return False, f"Authentication failed: {str(e)}"

    def download_and_rename_files(self, drive_folder_id, naming_convention):
        """Download files from Google Drive and rename them."""
        if not self.drive:
            return False, "Google Drive not authenticated", []
        try:
            query = f"'{drive_folder_id}' in parents and trashed=false"
            file_list = self.drive.ListFile({'q': query}).GetList()
            if not file_list:
                return False, "No files found in the specified folder.", []
            existing_dataset, start_index = safe_load_dataset(self.dataset_name)
            renamed_files = []
            for i, file in enumerate(tqdm(file_list, desc="Downloading files", unit="file")):
                if 'mimeType' in file and 'image' in file['mimeType'].lower():
                    new_filename = f"{naming_convention}_{start_index + i + 1}.jpg"
                    file_path = os.path.join(self.local_images_dir, new_filename)
                    file.GetContentFile(file_path)
                    if is_valid_image(file_path):
                        renamed_files.append({
                            'file_path': file_path,
                            'original_name': file['title'],
                            'new_name': new_filename
                        })
                        logger.info(f"Downloaded and renamed: {file['title']} -> {new_filename}")
                    else:
                        logger.error(f"Invalid image detected, removing {file_path}")
                        os.remove(file_path)
            return True, f"Processed {len(renamed_files)} images", renamed_files
        except Exception as e:
            return False, f"Error during download: {str(e)}", []

    def update_huggingface_dataset(self, renamed_files):
        """Update Hugging Face dataset with new images."""
        try:
            df = pd.DataFrame(renamed_files)
            new_dataset = Dataset.from_pandas(df)
            existing_dataset, _ = safe_load_dataset(self.dataset_name)
            if existing_dataset and 'train' in existing_dataset:
                combined_dataset = concatenate_datasets([existing_dataset['train'], new_dataset])
            else:
                combined_dataset = new_dataset
            combined_dataset.push_to_hub(self.dataset_name, split="train")
            return True, f"Successfully updated dataset '{self.dataset_name}' with {len(renamed_files)} new images."
        except Exception as e:
            return False, f"Error updating Hugging Face dataset: {str(e)}"

# Process Pipeline
def process_pipeline(folder_id, naming_convention):
    """Main pipeline for processing images and updating dataset."""
    manager = DatasetManager()
    # Step 1: Authenticate Google Drive
    auth_success, auth_message = manager.authenticate_drive()
    if not auth_success:
        return auth_message, []
    # Step 2: Download and rename files
    success, message, renamed_files = manager.download_and_rename_files(folder_id, naming_convention)
    if not success:
        return message, []
    # Step 3: Update Hugging Face dataset
    success, hf_message = manager.update_huggingface_dataset(renamed_files)
    return f"{message}\n{hf_message}", renamed_files

# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# Sports Cards Dataset Processor")
    with gr.Box():  # gr.Box is available in Gradio 3.x; on Gradio 4+ use gr.Group() instead
        gr.Markdown("### Instructions: Upload from Google Drive and Update Hugging Face Dataset")
    with gr.Row():
        folder_id = gr.Textbox(label="Google Drive Folder ID", placeholder="Enter the folder ID")
        naming_convention = gr.Textbox(label="Naming Convention", placeholder="e.g., sports_card")
    process_btn = gr.Button("Process Images")
    output = gr.Textbox(label="Status")
    output_table = gr.Dataframe(label="Processed Files", headers=["Original Name", "New Name", "File Path"])

    def process_ui(folder_id, naming_convention):
        status, renamed_files = process_pipeline(folder_id, naming_convention)
        table_data = [[file['original_name'], file['new_name'], file['file_path']] for file in renamed_files]
        return status, table_data

    process_btn.click(process_ui, inputs=[folder_id, naming_convention], outputs=[output, output_table])


if __name__ == "__main__":
    demo.launch()
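# Local usage sketch (assumes settings.yaml and the Google OAuth client secrets file
# it points to are in the working directory): run `python app.py`, open the Gradio URL,
# enter a Drive folder ID and naming convention, then click "Process Images".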