File size: 6,905 Bytes
85ad390
 
 
bcde0da
85ad390
 
 
 
 
d86b86d
bcde0da
85ad390
 
 
bcde0da
d86b86d
 
 
1552b06
85ad390
4f13b31
85ad390
 
4f13b31
85ad390
 
 
 
 
 
 
 
d86b86d
 
 
 
 
1552b06
 
d86b86d
 
1552b06
d86b86d
1552b06
 
d86b86d
1552b06
d86b86d
1552b06
d86b86d
1552b06
85ad390
 
 
 
 
9cc14fb
 
85ad390
 
 
 
 
 
 
 
 
9cc14fb
4f13b31
 
 
 
 
85ad390
9cc14fb
 
 
 
 
 
 
 
 
 
 
85ad390
9cc14fb
 
85ad390
 
9cc14fb
85ad390
9cc14fb
 
 
 
 
 
85ad390
4f13b31
9cc14fb
85ad390
9cc14fb
 
 
 
85ad390
9cc14fb
85ad390
9cc14fb
85ad390
9cc14fb
 
85ad390
 
9cc14fb
85ad390
9cc14fb
4f13b31
 
 
 
 
 
 
 
 
 
85ad390
 
4f13b31
85ad390
9cc14fb
85ad390
 
 
9cc14fb
85ad390
 
 
 
 
 
 
 
9cc14fb
 
85ad390
 
 
 
9cc14fb
 
85ad390
 
d86b86d
 
 
 
4f13b31
 
8ee319e
d86b86d
 
9cc14fb
 
 
85ad390
d86b86d
 
 
 
 
85ad390
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import logging
import os

import gradio as gr
import pandas as pd
import yaml
from datasets import Dataset, concatenate_datasets, load_dataset
from PIL import Image
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from tqdm import tqdm

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load settings. The encoding is given explicitly so the file is read the same
# way regardless of the platform's default locale encoding.
with open('settings.yaml', 'r', encoding='utf-8') as file:
    # safe_load returns None for an empty document; fall back to an empty
    # mapping so later key lookups fail with a clear KeyError instead of a
    # TypeError on None.
    settings = yaml.safe_load(file) or {}

class DatasetManager:
    """Download images from Google Drive and append them to a Hugging Face dataset.

    Typical use is ``authenticate_drive()`` followed by
    ``download_and_rename_files(...)`` and ``update_huggingface_dataset(...)``.
    Every public method reports failure through its return value (a leading
    ``bool`` plus a human-readable message) instead of raising.
    """

    # File used to cache Google OAuth credentials between runs.
    _CREDENTIALS_FILE = "credentials.txt"

    def __init__(self, local_images_dir="downloaded_cards"):
        """Remember the download directory and create it if it is missing.

        Args:
            local_images_dir: Directory that downloaded images are written to.
        """
        self.local_images_dir = local_images_dir
        # Populated by authenticate_drive(); stays None until that succeeds.
        self.drive = None
        self.dataset_name = "GotThatData/sports-cards"

        # Create local directory if it doesn't exist
        os.makedirs(local_images_dir, exist_ok=True)

    def authenticate_drive(self):
        """Authenticate with Google Drive and store the client on ``self.drive``.

        Saved credentials are loaded from the credentials file when present;
        otherwise a local-webserver OAuth flow is started. Expired tokens are
        refreshed. The resulting credentials are saved back to the same file.

        Returns:
            ``(success, message)`` where ``success`` is a bool.
        """
        try:
            gauth = GoogleAuth()
            # Use the settings from yaml file
            gauth.settings['client_config_file'] = settings['client_secrets_file']

            # Try to load saved credentials
            gauth.LoadCredentialsFile(self._CREDENTIALS_FILE)

            if gauth.credentials is None:
                # Authenticate if no credentials found
                gauth.LocalWebserverAuth()
            elif gauth.access_token_expired:
                # Refresh them if expired
                gauth.Refresh()
            else:
                # Initialize the saved credentials
                gauth.Authorize()

            # Save the credentials for future use
            gauth.SaveCredentialsFile(self._CREDENTIALS_FILE)

            self.drive = GoogleDrive(gauth)
            return True, "Successfully authenticated with Google Drive"
        except Exception as e:
            return False, f"Authentication failed: {str(e)}"

    def _existing_image_count(self):
        """Return the row count of the hub dataset's train split, or 0 if unavailable."""
        try:
            existing_dataset = load_dataset(self.dataset_name)
            logger.info(f"Loaded existing dataset: {self.dataset_name}")
            return len(existing_dataset['train']) if 'train' in existing_dataset else 0
        except Exception as e:
            logger.info(f"No existing dataset found, starting fresh: {str(e)}")
            return 0

    def download_and_rename_files(self, drive_folder_id, naming_convention):
        """Download image files from Google Drive and store them under new names.

        Files whose ``mimeType`` does not start with ``image/`` are skipped.
        Each accepted image is saved as ``<naming_convention>_<n>.jpg`` where
        ``n`` continues from the current size of the hub dataset's train
        split. Images that PIL cannot verify are deleted again and skipped.

        Args:
            drive_folder_id: ID of a Drive folder; if the folder listing is
                empty the ID is treated as a single file ID instead.
            naming_convention: Prefix for the new file names. Must not contain
                path separators.

        Returns:
            ``(success, message, renamed_files)``; ``renamed_files`` is a list
            of dicts with the keys ``file_path``, ``original_name``,
            ``new_name`` and ``image``.
        """
        if not self.drive:
            return False, "Google Drive not authenticated", []

        # The prefix becomes part of a local path; refuse anything that would
        # escape the download directory.
        prefix = str(naming_convention)
        if os.path.basename(prefix) != prefix:
            return False, "Invalid naming convention: path separators are not allowed", []

        try:
            # List files in the folder. Backslashes and single quotes are
            # escaped so the ID cannot break out of the quoted query literal.
            escaped_id = str(drive_folder_id).replace("\\", "\\\\").replace("'", "\\'")
            query = f"'{escaped_id}' in parents and trashed=false"
            file_list = self.drive.ListFile({'q': query}).GetList()

            if not file_list:
                # Try to get single file if folder is empty
                file = self.drive.CreateFile({'id': drive_folder_id})
                if file:
                    file_list = [file]
                else:
                    return False, "No files found with the specified ID", []

            start_index = self._existing_image_count()

            renamed_files = []
            for file in tqdm(file_list, desc="Downloading files"):
                if not file['mimeType'].startswith('image/'):
                    continue

                # Number by accepted images only. Numbering by position in the
                # listing leaves gaps for skipped files, and because the next
                # run starts from the dataset length those gaps cause reused
                # file names.
                # NOTE(review): the ".jpg" suffix is applied regardless of the
                # actual image type - confirm this is intended.
                new_filename = f"{prefix}_{start_index + len(renamed_files) + 1}.jpg"
                file_path = os.path.join(self.local_images_dir, new_filename)

                # Download file
                file.GetContentFile(file_path)

                # Verify the image can be opened
                try:
                    with Image.open(file_path) as img:
                        img.verify()
                    renamed_files.append({
                        'file_path': file_path,
                        'original_name': file['title'],
                        'new_name': new_filename,
                        'image': file_path
                    })
                except Exception as e:
                    logger.error(f"Error processing image {file['title']}: {str(e)}")
                    if os.path.exists(file_path):
                        os.remove(file_path)

            return True, f"Successfully processed {len(renamed_files)} images", renamed_files
        except Exception as e:
            return False, f"Error downloading files: {str(e)}", []

    def update_huggingface_dataset(self, renamed_files):
        """Append the given image records to the hub dataset's train split.

        Args:
            renamed_files: List of record dicts as produced by
                ``download_and_rename_files``.

        Returns:
            ``(success, message)`` where ``success`` is a bool.
        """
        # Nothing to add: pushing an empty dataset would replace the existing
        # train split with zero rows.
        if not renamed_files:
            return True, f"No new images to add; dataset '{self.dataset_name}' left unchanged"

        try:
            # Create a DataFrame with the file information
            df = pd.DataFrame(renamed_files)

            # Create a Hugging Face Dataset
            new_dataset = Dataset.from_pandas(df)

            # Only the load itself is treated as "dataset does not exist yet".
            # NOTE(review): a transient network failure here is
            # indistinguishable from a missing dataset and would still lead to
            # a push of only the new rows - consider narrowing the exception.
            try:
                existing_dataset = load_dataset(self.dataset_name)
            except Exception:
                existing_dataset = None
                logger.info("Creating new dataset")

            # Merge outside the inner try so that a failed merge is reported
            # as an error instead of silently overwriting the existing data.
            if existing_dataset is not None and 'train' in existing_dataset:
                new_dataset = concatenate_datasets([existing_dataset['train'], new_dataset])

            # Push to Hugging Face Hub
            new_dataset.push_to_hub(self.dataset_name, split="train")

            return True, f"Successfully updated dataset '{self.dataset_name}' with {len(renamed_files)} new images"
        except Exception as e:
            return False, f"Error updating Hugging Face dataset: {str(e)}"

def process_pipeline(folder_id, naming_convention):
    """Run the full flow: authenticate, download images, update the dataset.

    Args:
        folder_id: Google Drive folder (or single file) ID.
        naming_convention: Prefix used when renaming downloaded images.

    Returns:
        A status string describing the outcome, suitable for display in the UI.
    """
    # Values come from free-text boxes, so tolerate None and stray whitespace
    # from copy/paste.
    folder_id = (folder_id or "").strip()
    naming_convention = (naming_convention or "").strip()

    # Fail fast before starting authentication or any Drive request.
    if not folder_id:
        return "Please provide a Google Drive file or folder ID"

    manager = DatasetManager()

    # Step 1: Authenticate
    auth_success, auth_message = manager.authenticate_drive()
    if not auth_success:
        return auth_message

    # Step 2: Download and rename files
    success, message, renamed_files = manager.download_and_rename_files(folder_id, naming_convention)
    if not success:
        return message

    # With nothing downloaded there is nothing to publish; skipping the update
    # avoids pushing an empty dataset over the existing one.
    if not renamed_files:
        return f"{message}\nNo images to add; dataset was not updated"

    # Step 3: Update Hugging Face dataset. The success flag is not needed
    # because the message already describes any failure.
    _, hf_message = manager.update_huggingface_dataset(renamed_files)
    return f"{message}\n{hf_message}"

# Gradio interface: two text inputs (Drive ID and file-name prefix) feed
# process_pipeline, whose status string is shown in a single output box.
_folder_id_box = gr.Textbox(
    value="151VOxPO91mg0C3ORiioGUd4hogzP1ujm",
    placeholder="Enter the ID from your Google Drive URL",
    label="Google Drive File/Folder ID",
)
_naming_box = gr.Textbox(
    value="sports_card",
    placeholder="e.g., sports_card",
    label="Naming Convention",
)
_status_box = gr.Textbox(label="Status")

demo = gr.Interface(
    fn=process_pipeline,
    inputs=[_folder_id_box, _naming_box],
    outputs=_status_box,
    title="Sports Cards Dataset Processor",
    description="Download card images from Google Drive and add them to the sports-cards dataset",
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()