GotThatData committed on
Commit
8067321
·
verified ·
1 Parent(s): e8c7eab
Files changed (1) hide show
  1. app.py +66 -37
app.py CHANGED
@@ -30,23 +30,18 @@ class DatasetManager:
30
  """Authenticate with Google Drive"""
31
  try:
32
  gauth = GoogleAuth()
33
- # Use the settings from yaml file
34
  gauth.settings['client_config_file'] = settings['client_secrets_file']
35
 
36
  # Try to load saved credentials
37
  gauth.LoadCredentialsFile("credentials.txt")
38
 
39
  if gauth.credentials is None:
40
- # Authenticate if no credentials found
41
  gauth.LocalWebserverAuth()
42
  elif gauth.access_token_expired:
43
- # Refresh them if expired
44
  gauth.Refresh()
45
  else:
46
- # Initialize the saved credentials
47
  gauth.Authorize()
48
 
49
- # Save the credentials for future use
50
  gauth.SaveCredentialsFile("credentials.txt")
51
 
52
  self.drive = GoogleDrive(gauth)
@@ -60,12 +55,10 @@ class DatasetManager:
60
  return False, "Google Drive not authenticated", []
61
 
62
  try:
63
- # List files in the folder
64
  query = f"'{drive_folder_id}' in parents and trashed=false"
65
  file_list = self.drive.ListFile({'q': query}).GetList()
66
 
67
  if not file_list:
68
- # Try to get single file if folder is empty
69
  file = self.drive.CreateFile({'id': drive_folder_id})
70
  if file:
71
  file_list = [file]
@@ -73,7 +66,6 @@ class DatasetManager:
73
  return False, "No files found with the specified ID", []
74
 
75
  renamed_files = []
76
- existing_dataset = None
77
  try:
78
  existing_dataset = load_dataset(self.dataset_name)
79
  logger.info(f"Loaded existing dataset: {self.dataset_name}")
@@ -87,10 +79,8 @@ class DatasetManager:
87
  new_filename = f"{naming_convention}_{start_index + i + 1}.jpg"
88
  file_path = os.path.join(self.local_images_dir, new_filename)
89
 
90
- # Download file
91
  file.GetContentFile(file_path)
92
 
93
- # Verify the image can be opened
94
  try:
95
  with Image.open(file_path) as img:
96
  img.verify()
@@ -112,22 +102,16 @@ class DatasetManager:
112
  def update_huggingface_dataset(self, renamed_files):
113
  """Update the sports-cards dataset with new images"""
114
  try:
115
- # Create a DataFrame with the file information
116
  df = pd.DataFrame(renamed_files)
117
-
118
- # Create a Hugging Face Dataset
119
  new_dataset = Dataset.from_pandas(df)
120
 
121
  try:
122
- # Try to load existing dataset
123
  existing_dataset = load_dataset(self.dataset_name)
124
- # Concatenate with existing dataset if it exists
125
  if 'train' in existing_dataset:
126
  new_dataset = concatenate_datasets([existing_dataset['train'], new_dataset])
127
  except Exception:
128
  logger.info("Creating new dataset")
129
 
130
- # Push to Hugging Face Hub
131
  new_dataset.push_to_hub(self.dataset_name, split="train")
132
 
133
  return True, f"Successfully updated dataset '{self.dataset_name}' with {len(renamed_files)} new images"
@@ -138,39 +122,84 @@ def process_pipeline(folder_id, naming_convention):
138
  """Main pipeline to process images and update dataset"""
139
  manager = DatasetManager()
140
 
141
- # Step 1: Authenticate
142
  auth_success, auth_message = manager.authenticate_drive()
143
  if not auth_success:
144
  return auth_message
145
 
146
- # Step 2: Download and rename files
147
  success, message, renamed_files = manager.download_and_rename_files(folder_id, naming_convention)
148
  if not success:
149
  return message
150
 
151
- # Step 3: Update Hugging Face dataset
152
  success, hf_message = manager.update_huggingface_dataset(renamed_files)
153
  return f"{message}\n{hf_message}"
154
 
155
- # Gradio interface
156
- demo = gr.Interface(
157
- fn=process_pipeline,
158
- inputs=[
159
- gr.Textbox(
160
- label="Google Drive File/Folder ID",
161
- placeholder="Enter the ID from your Google Drive URL",
162
- value="151VOxPO91mg0C3ORiioGUd4hogzP1ujm"
163
- ),
164
- gr.Textbox(
165
- label="Naming Convention",
166
- placeholder="e.g., sports_card",
167
- value="sports_card"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  )
169
- ],
170
- outputs=gr.Textbox(label="Status"),
171
- title="Sports Cards Dataset Processor",
172
- description="Download card images from Google Drive and add them to the sports-cards dataset"
173
- )
 
174
 
175
  if __name__ == "__main__":
176
  demo.launch()
 
30
  """Authenticate with Google Drive"""
31
  try:
32
  gauth = GoogleAuth()
 
33
  gauth.settings['client_config_file'] = settings['client_secrets_file']
34
 
35
  # Try to load saved credentials
36
  gauth.LoadCredentialsFile("credentials.txt")
37
 
38
  if gauth.credentials is None:
 
39
  gauth.LocalWebserverAuth()
40
  elif gauth.access_token_expired:
 
41
  gauth.Refresh()
42
  else:
 
43
  gauth.Authorize()
44
 
 
45
  gauth.SaveCredentialsFile("credentials.txt")
46
 
47
  self.drive = GoogleDrive(gauth)
 
55
  return False, "Google Drive not authenticated", []
56
 
57
  try:
 
58
  query = f"'{drive_folder_id}' in parents and trashed=false"
59
  file_list = self.drive.ListFile({'q': query}).GetList()
60
 
61
  if not file_list:
 
62
  file = self.drive.CreateFile({'id': drive_folder_id})
63
  if file:
64
  file_list = [file]
 
66
  return False, "No files found with the specified ID", []
67
 
68
  renamed_files = []
 
69
  try:
70
  existing_dataset = load_dataset(self.dataset_name)
71
  logger.info(f"Loaded existing dataset: {self.dataset_name}")
 
79
  new_filename = f"{naming_convention}_{start_index + i + 1}.jpg"
80
  file_path = os.path.join(self.local_images_dir, new_filename)
81
 
 
82
  file.GetContentFile(file_path)
83
 
 
84
  try:
85
  with Image.open(file_path) as img:
86
  img.verify()
 
102
  def update_huggingface_dataset(self, renamed_files):
103
  """Update the sports-cards dataset with new images"""
104
  try:
 
105
  df = pd.DataFrame(renamed_files)
 
 
106
  new_dataset = Dataset.from_pandas(df)
107
 
108
  try:
 
109
  existing_dataset = load_dataset(self.dataset_name)
 
110
  if 'train' in existing_dataset:
111
  new_dataset = concatenate_datasets([existing_dataset['train'], new_dataset])
112
  except Exception:
113
  logger.info("Creating new dataset")
114
 
 
115
  new_dataset.push_to_hub(self.dataset_name, split="train")
116
 
117
  return True, f"Successfully updated dataset '{self.dataset_name}' with {len(renamed_files)} new images"
 
122
  """Main pipeline to process images and update dataset"""
123
  manager = DatasetManager()
124
 
 
125
  auth_success, auth_message = manager.authenticate_drive()
126
  if not auth_success:
127
  return auth_message
128
 
 
129
  success, message, renamed_files = manager.download_and_rename_files(folder_id, naming_convention)
130
  if not success:
131
  return message
132
 
 
133
  success, hf_message = manager.update_huggingface_dataset(renamed_files)
134
  return f"{message}\n{hf_message}"
135
 
136
+ # Custom CSS for web-safe fonts and clean styling
137
+ custom_css = """
138
+ .gradio-container {
139
+ font-family: Arial, sans-serif !important;
140
+ }
141
+
142
+ h1, h2, h3 {
143
+ font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif !important;
144
+ font-weight: 600 !important;
145
+ }
146
+
147
+ .gr-button {
148
+ font-family: Arial, sans-serif !important;
149
+ }
150
+
151
+ .gr-input {
152
+ font-family: 'Courier New', Courier, monospace !important;
153
+ }
154
+
155
+ .gr-box {
156
+ border-radius: 8px !important;
157
+ border: 1px solid #e5e5e5 !important;
158
+ }
159
+
160
+ .gr-padded {
161
+ padding: 16px !important;
162
+ }
163
+ """
164
+
165
+ # Gradio Blocks interface styled with the custom CSS defined above
166
+ with gr.Blocks(css=custom_css) as demo:
167
+ gr.Markdown("# Sports Cards Dataset Processor")
168
+
169
+ with gr.Box():
170
+ gr.Markdown("""
171
+ ### Instructions
172
+ 1. Enter the Google Drive folder/file ID
173
+ 2. Choose a naming convention for your cards
174
+ 3. Click Process to start
175
+ """)
176
+
177
+ with gr.Row():
178
+ with gr.Column():
179
+ folder_id = gr.Textbox(
180
+ label="Google Drive File/Folder ID",
181
+ placeholder="Enter the ID from your Google Drive URL",
182
+ value="151VOxPO91mg0C3ORiioGUd4hogzP1ujm"
183
+ )
184
+ naming = gr.Textbox(
185
+ label="Naming Convention",
186
+ placeholder="e.g., sports_card",
187
+ value="sports_card"
188
+ )
189
+ process_btn = gr.Button("Process Images", variant="primary")
190
+
191
+ with gr.Box():
192
+ output = gr.Textbox(
193
+ label="Processing Status",
194
+ show_label=True,
195
+ lines=5
196
  )
197
+
198
+ process_btn.click(
199
+ fn=process_pipeline,
200
+ inputs=[folder_id, naming],
201
+ outputs=output
202
+ )
203
 
204
  if __name__ == "__main__":
205
  demo.launch()