File size: 5,077 Bytes
85ad390
 
 
bcde0da
85ad390
 
 
 
 
 
bcde0da
85ad390
 
 
bcde0da
85ad390
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
import os
import gradio as gr
from datasets import load_dataset, Dataset
import pandas as pd
from PIL import Image
import shutil
from tqdm import tqdm
import logging

# Configure the root logger at import time (INFO level) and create the
# module-level logger used by DatasetManager to report unreadable images.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class DatasetManager:
    """Download images from a Google Drive folder and push their records to the Hugging Face Hub.

    The public methods report failures through their return value (a tuple
    starting with a success flag and a human-readable message) instead of
    raising, so a caller can show the message directly to a user.
    """

    def __init__(self, dataset_name=None, local_images_dir="downloaded_cards"):
        """Store the configuration and make sure the download directory exists.

        Args:
            dataset_name: Optional default dataset name; used by
                ``update_huggingface_dataset`` when it is called without one.
            local_images_dir: Directory that downloaded images are written to.
                It is created if it does not exist yet.
        """
        self.dataset_name = dataset_name
        self.local_images_dir = local_images_dir
        # Populated by authenticate_drive(); None means "not authenticated".
        self.drive = None

        # Create local directory if it doesn't exist
        os.makedirs(local_images_dir, exist_ok=True)

    def authenticate_drive(self):
        """Authenticate with Google Drive via pydrive2's ``LocalWebserverAuth``.

        Returns:
            ``(True, message)`` on success (``self.drive`` is then set), or
            ``(False, message)`` with the error text if anything went wrong.
        """
        try:
            gauth = GoogleAuth()
            gauth.LocalWebserverAuth()
            self.drive = GoogleDrive(gauth)
            return True, "Successfully authenticated with Google Drive"
        except Exception as e:
            return False, f"Authentication failed: {e}"

    def download_and_rename_files(self, drive_folder_id, naming_convention):
        """Download the images of a Drive folder as ``<naming_convention>_<n>.jpg``.

        Entries whose MIME type does not start with ``image/`` are skipped.
        Each downloaded file is checked with PIL; files that fail the check
        are logged and deleted again.

        Args:
            drive_folder_id: ID of the Google Drive folder to list.
            naming_convention: Prefix for the local file names.

        Returns:
            ``(success, message, renamed_files)`` where ``renamed_files`` is a
            list of dicts with the keys ``file_path``, ``original_name`` and
            ``new_name``. The list is empty whenever ``success`` is False.
        """
        if not self.drive:
            return False, "Google Drive not authenticated", []

        folder_id = str(drive_folder_id or "").strip()
        if not folder_id:
            return False, "No Google Drive folder ID provided", []

        # The ID is embedded in a single-quoted literal of the Drive query;
        # escape backslashes and quotes so it cannot terminate that literal.
        escaped_id = folder_id.replace("\\", "\\\\").replace("'", "\\'")

        # Keep every download inside local_images_dir even if the prefix
        # contains path separators.
        prefix = str(naming_convention).replace("/", "_").replace("\\", "_")

        try:
            # List files in the folder
            query = f"'{escaped_id}' in parents and trashed=false"
            file_list = self.drive.ListFile({'q': query}).GetList()

            if not file_list:
                return False, "No files found in the specified folder", []

            renamed_files = []
            # The index counts every listed entry, so numbers in the file
            # names can have gaps where non-image entries were skipped.
            numbered = enumerate(tqdm(file_list, desc="Downloading files"), start=1)
            for index, drive_file in numbered:
                if not drive_file['mimeType'].startswith('image/'):
                    continue

                # NOTE(review): the ".jpg" suffix is applied regardless of the
                # reported MIME type - confirm this is intended for PNG etc.
                new_filename = f"{prefix}_{index}.jpg"
                file_path = os.path.join(self.local_images_dir, new_filename)

                # Download file
                drive_file.GetContentFile(file_path)

                # Verify the image can be opened; drop it otherwise
                try:
                    with Image.open(file_path) as img:
                        img.verify()
                    renamed_files.append({
                        'file_path': file_path,
                        'original_name': drive_file['title'],
                        'new_name': new_filename,
                    })
                except Exception as e:
                    logger.error(
                        "Error processing image %s: %s", drive_file['title'], e
                    )
                    if os.path.exists(file_path):
                        os.remove(file_path)

            return True, f"Successfully processed {len(renamed_files)} images", renamed_files
        except Exception as e:
            return False, f"Error downloading files: {e}", []

    def update_huggingface_dataset(self, dataset_name, renamed_files):
        """Push the downloaded-file records to the Hugging Face Hub.

        Args:
            dataset_name: Target dataset name; when empty, the
                ``dataset_name`` given to the constructor is used instead.
            renamed_files: Records as returned by
                ``download_and_rename_files``.

        Returns:
            ``(True, message)`` after a successful push, otherwise
            ``(False, message)`` describing why nothing was pushed.
        """
        target = dataset_name or self.dataset_name
        if not target:
            return False, "No Hugging Face dataset name provided"

        # Refuse to push when there is nothing to upload, rather than
        # publishing an empty dataset under the target name.
        if not renamed_files:
            return False, f"No images to upload to dataset '{target}'"

        try:
            # Create a DataFrame with the file information
            df = pd.DataFrame(renamed_files)

            # NOTE(review): the records hold only path/name strings, so the
            # pushed dataset contains those columns, not the image bytes -
            # confirm whether casting to an image feature is wanted.
            dataset = Dataset.from_pandas(df)

            # Push to Hugging Face Hub
            dataset.push_to_hub(target)

            return True, f"Successfully updated dataset '{target}' with {len(renamed_files)} images"
        except Exception as e:
            return False, f"Error updating Hugging Face dataset: {e}"

def process_pipeline(folder_id, naming_convention, dataset_name):
    """Run the full flow: authenticate, download/rename images, optionally upload.

    Args:
        folder_id: Google Drive folder ID; surrounding whitespace is ignored.
        naming_convention: Prefix for the downloaded file names; falls back
            to ``"card"`` when blank.
        dataset_name: Hugging Face dataset name. When blank, the upload step
            is skipped.

    Returns:
        A status message for display. If the upload step ran, its message is
        appended to the download message on a new line.
    """
    # Text typed or pasted into the UI often carries stray whitespace.
    folder_id = (folder_id or "").strip()
    if not folder_id:
        return "Please provide a Google Drive folder ID"
    naming_convention = (naming_convention or "").strip() or "card"
    dataset_name = (dataset_name or "").strip()

    manager = DatasetManager(dataset_name=dataset_name or None)

    # Step 1: Authenticate
    auth_success, auth_message = manager.authenticate_drive()
    if not auth_success:
        return auth_message

    # Step 2: Download and rename files
    success, message, renamed_files = manager.download_and_rename_files(folder_id, naming_convention)
    if not success:
        return message

    # Step 3: Update Hugging Face dataset (only when a name was given)
    if dataset_name:
        # The upload message already states success or failure, so the flag
        # itself is not needed here.
        _, hf_message = manager.update_huggingface_dataset(dataset_name, renamed_files)
        return f"{message}\n{hf_message}"

    return message

# Gradio interface: three text inputs mapped positionally onto
# process_pipeline(folder_id, naming_convention, dataset_name); the returned
# status message is shown in a single text box.
demo = gr.Interface(
    fn=process_pipeline,
    inputs=[
        gr.Textbox(
            label="Google Drive Folder ID",
            placeholder="Enter the folder ID from your Google Drive URL"
        ),
        gr.Textbox(
            label="Naming Convention",
            placeholder="e.g., card",
            value="card"
        ),
        # No `required` keyword here: gr.Textbox does not define one, and
        # current Gradio releases raise TypeError on unknown keyword
        # arguments. Leaving the field empty simply skips the upload step.
        gr.Textbox(
            label="Hugging Face Dataset Name",
            placeholder="username/dataset-name (optional)"
        )
    ],
    outputs=gr.Textbox(label="Status"),
    title="Card Image Processor",
    description="Download card images from Google Drive and add them to your Hugging Face dataset"
)

if __name__ == "__main__":
    demo.launch()