File size: 6,247 Bytes
85ad390
 
 
bcde0da
85ad390
 
 
 
 
 
bcde0da
85ad390
 
 
bcde0da
85ad390
4f13b31
85ad390
 
4f13b31
85ad390
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f13b31
 
 
 
 
 
85ad390
 
4f13b31
 
 
 
 
 
 
 
 
 
85ad390
 
4f13b31
85ad390
 
 
 
 
 
 
 
 
 
 
 
4f13b31
 
85ad390
 
 
 
 
 
 
 
 
 
4f13b31
 
85ad390
 
 
 
4f13b31
 
 
 
 
 
 
 
 
 
 
85ad390
 
4f13b31
85ad390
4f13b31
85ad390
 
 
4f13b31
85ad390
 
 
 
 
 
 
 
 
 
 
 
 
 
4f13b31
 
85ad390
 
 
 
 
 
4f13b31
 
 
85ad390
 
 
 
4f13b31
85ad390
 
 
4f13b31
 
85ad390
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import logging
import os
import shutil

import gradio as gr
import pandas as pd
from datasets import Dataset, concatenate_datasets, load_dataset
from PIL import Image
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from tqdm import tqdm

# Set up logging
# Module-level logger at INFO so dataset load/create decisions and
# per-image download errors are visible on the console.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class DatasetManager:
    """Download card images from Google Drive and sync them into the
    ``GotThatData/sports-cards`` dataset on the Hugging Face Hub.

    Typical flow: ``authenticate_drive()`` -> ``download_and_rename_files()``
    -> ``update_huggingface_dataset()``.
    """

    def __init__(self, local_images_dir="downloaded_cards"):
        """
        Args:
            local_images_dir: Directory where downloaded images are saved;
                created on construction if it does not already exist.
        """
        self.local_images_dir = local_images_dir
        self.drive = None  # populated by authenticate_drive()
        self.dataset_name = "GotThatData/sports-cards"

        # Create local directory if it doesn't exist
        os.makedirs(local_images_dir, exist_ok=True)

    def authenticate_drive(self):
        """Authenticate with Google Drive via the local-webserver OAuth flow.

        Returns:
            Tuple ``(success: bool, message: str)``.
        """
        try:
            gauth = GoogleAuth()
            gauth.LocalWebserverAuth()
            self.drive = GoogleDrive(gauth)
            return True, "Successfully authenticated with Google Drive"
        except Exception as e:
            return False, f"Authentication failed: {str(e)}"

    def download_and_rename_files(self, drive_folder_id, naming_convention):
        """Download image files from Google Drive, renaming them sequentially.

        Args:
            drive_folder_id: ID of a Drive folder (or of a single file, used
                as a fallback when the folder query returns nothing).
            naming_convention: Filename prefix, e.g. ``"sports_card"``.

        Returns:
            Tuple ``(success, message, renamed_files)`` where ``renamed_files``
            is a list of dicts with keys ``file_path``, ``original_name``,
            ``new_name`` and ``image``.
        """
        if not self.drive:
            return False, "Google Drive not authenticated", []

        try:
            # List files in the folder
            query = f"'{drive_folder_id}' in parents and trashed=false"
            file_list = self.drive.ListFile({'q': query}).GetList()

            if not file_list:
                # Fall back to treating the ID as a single file, not a folder.
                file = self.drive.CreateFile({'id': drive_folder_id})
                if file:
                    file_list = [file]
                else:
                    return False, "No files found with the specified ID", []

            renamed_files = []
            try:
                existing_dataset = load_dataset(self.dataset_name)
                logger.info(f"Loaded existing dataset: {self.dataset_name}")
                # Continue numbering after the images already in the dataset.
                start_index = len(existing_dataset['train']) if 'train' in existing_dataset else 0
            except Exception as e:
                logger.info(f"No existing dataset found, starting fresh: {str(e)}")
                start_index = 0

            for file in tqdm(file_list, desc="Downloading files"):
                if not file['mimeType'].startswith('image/'):
                    continue  # skip non-image files without consuming an index

                # BUGFIX: number by images actually kept (len(renamed_files)),
                # not by the raw loop index — skipped or corrupt files used to
                # leave gaps in the filename sequence.
                new_filename = f"{naming_convention}_{start_index + len(renamed_files) + 1}.jpg"
                file_path = os.path.join(self.local_images_dir, new_filename)

                # Download file
                file.GetContentFile(file_path)

                # Verify the download is a readable image before keeping it.
                try:
                    with Image.open(file_path) as img:
                        img.verify()
                    renamed_files.append({
                        'file_path': file_path,
                        'original_name': file['title'],
                        'new_name': new_filename,
                        'image': file_path  # Adding image column for dataset
                    })
                except Exception as e:
                    logger.error(f"Error processing image {file['title']}: {str(e)}")
                    if os.path.exists(file_path):
                        os.remove(file_path)

            return True, f"Successfully processed {len(renamed_files)} images", renamed_files
        except Exception as e:
            return False, f"Error downloading files: {str(e)}", []

    def update_huggingface_dataset(self, renamed_files):
        """Append new image records to the sports-cards dataset and push it.

        Args:
            renamed_files: List of dicts as produced by
                ``download_and_rename_files``.

        Returns:
            Tuple ``(success: bool, message: str)``.
        """
        try:
            if not renamed_files:
                # Nothing to add — avoid building/pushing an empty dataset.
                return True, f"No new images to add to '{self.dataset_name}'"

            # Create a DataFrame with the file information
            df = pd.DataFrame(renamed_files)

            # Create a Hugging Face Dataset from the new files
            new_dataset = Dataset.from_pandas(df)

            try:
                # Try to load existing dataset
                existing_dataset = load_dataset(self.dataset_name)
                # Concatenate with existing dataset if it exists.
                # BUGFIX: concatenate_datasets was referenced but never
                # imported, so this path always raised NameError (silently
                # caught below); it is now imported at module level.
                if 'train' in existing_dataset:
                    new_dataset = concatenate_datasets([existing_dataset['train'], new_dataset])
            except Exception:
                logger.info("Creating new dataset")

            # Push to Hugging Face Hub
            new_dataset.push_to_hub(self.dataset_name, split="train")

            return True, f"Successfully updated dataset '{self.dataset_name}' with {len(renamed_files)} new images"
        except Exception as e:
            return False, f"Error updating Hugging Face dataset: {str(e)}"

def process_pipeline(folder_id, naming_convention):
    """Run the full workflow: authenticate, download/rename, update dataset.

    Args:
        folder_id: Google Drive file or folder ID to pull images from.
        naming_convention: Filename prefix for the renamed images.

    Returns:
        A status string — the first failure message, or the combined
        download and Hugging Face update messages on success.
    """
    manager = DatasetManager()

    # Step 1: authenticate with Google Drive; bail out on failure.
    ok, auth_msg = manager.authenticate_drive()
    if not ok:
        return auth_msg

    # Step 2: fetch and rename the images locally.
    ok, download_msg, files = manager.download_and_rename_files(
        folder_id, naming_convention
    )
    if not ok:
        return download_msg

    # Step 3: push the new images to the Hugging Face dataset; the combined
    # status is returned whether or not the push succeeded, matching the
    # original behavior.
    _, hf_msg = manager.update_huggingface_dataset(files)
    return f"{download_msg}\n{hf_msg}"

# Gradio interface
# Single-form UI: the user supplies a Drive file/folder ID and a filename
# prefix; process_pipeline returns a status string shown in the output box.
demo = gr.Interface(
    fn=process_pipeline,
    inputs=[
        gr.Textbox(
            label="Google Drive File/Folder ID",
            placeholder="Enter the ID from your Google Drive URL",
            value="151VOxPO91mg0C3ORiioGUd4hogzP1ujm"  # Pre-filled with provided ID
        ),
        gr.Textbox(
            label="Naming Convention",
            placeholder="e.g., card",
            value="sports_card"
        )
    ],
    outputs=gr.Textbox(label="Status"),
    title="Sports Cards Dataset Processor",
    description="Download card images from Google Drive and add them to the sports-cards dataset"
)

if __name__ == "__main__":
    demo.launch()  # start the Gradio app (blocks until shut down)