GotThatData committed (verified)
Commit 4f13b31 · Parent(s): 095edd3

Update app.py

Files changed (1): app.py (+45 −26)
app.py CHANGED
@@ -14,10 +14,10 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 class DatasetManager:
-    def __init__(self, dataset_name=None, local_images_dir="downloaded_cards"):
-        self.dataset_name = dataset_name
+    def __init__(self, local_images_dir="downloaded_cards"):
         self.local_images_dir = local_images_dir
         self.drive = None
+        self.dataset_name = "GotThatData/sports-cards"
 
         # Create local directory if it doesn't exist
         os.makedirs(local_images_dir, exist_ok=True)
@@ -43,12 +43,27 @@ class DatasetManager:
             file_list = self.drive.ListFile({'q': query}).GetList()
 
             if not file_list:
-                return False, "No files found in the specified folder", []
+                # Try to get single file if folder is empty
+                file = self.drive.CreateFile({'id': drive_folder_id})
+                if file:
+                    file_list = [file]
+                else:
+                    return False, "No files found with the specified ID", []
 
             renamed_files = []
+            existing_dataset = None
+            try:
+                existing_dataset = load_dataset(self.dataset_name)
+                logger.info(f"Loaded existing dataset: {self.dataset_name}")
+                # Get the current count of images to continue numbering
+                start_index = len(existing_dataset['train']) if 'train' in existing_dataset else 0
+            except Exception as e:
+                logger.info(f"No existing dataset found, starting fresh: {str(e)}")
+                start_index = 0
+
             for i, file in enumerate(tqdm(file_list, desc="Downloading files")):
                 if file['mimeType'].startswith('image/'):
-                    new_filename = f"{naming_convention}_{i+1}.jpg"
+                    new_filename = f"{naming_convention}_{start_index + i + 1}.jpg"
                     file_path = os.path.join(self.local_images_dir, new_filename)
 
                     # Download file
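Reviewer note: the hunk above (and the update_huggingface_dataset hunk below) now calls load_dataset and concatenate_datasets in addition to Dataset, but the import block at the top of app.py is not part of this diff. A minimal sketch of the imports the new code assumes:

# Assumed imports for the new calls in this commit; the actual import
# hunk is not shown in the diff, so treat this as a reviewer's sketch.
from datasets import Dataset, concatenate_datasets, load_dataset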
@@ -61,7 +76,8 @@
                         renamed_files.append({
                             'file_path': file_path,
                             'original_name': file['title'],
-                            'new_name': new_filename
+                            'new_name': new_filename,
+                            'image': file_path  # Adding image column for dataset
                         })
                     except Exception as e:
                         logger.error(f"Error processing image {file['title']}: {str(e)}")
@@ -72,23 +88,32 @@ class DatasetManager:
         except Exception as e:
             return False, f"Error downloading files: {str(e)}", []
 
-    def update_huggingface_dataset(self, dataset_name, renamed_files):
-        """Update or create Hugging Face dataset with new images"""
+    def update_huggingface_dataset(self, renamed_files):
+        """Update the sports-cards dataset with new images"""
         try:
             # Create a DataFrame with the file information
             df = pd.DataFrame(renamed_files)
 
-            # Create a Hugging Face Dataset
-            dataset = Dataset.from_pandas(df)
+            # Create a Hugging Face Dataset from the new files
+            new_dataset = Dataset.from_pandas(df)
+
+            try:
+                # Try to load existing dataset
+                existing_dataset = load_dataset(self.dataset_name)
+                # Concatenate with existing dataset if it exists
+                if 'train' in existing_dataset:
+                    new_dataset = concatenate_datasets([existing_dataset['train'], new_dataset])
+            except Exception:
+                logger.info("Creating new dataset")
 
             # Push to Hugging Face Hub
-            dataset.push_to_hub(dataset_name)
+            new_dataset.push_to_hub(self.dataset_name, split="train")
 
-            return True, f"Successfully updated dataset '{dataset_name}' with {len(renamed_files)} images"
+            return True, f"Successfully updated dataset '{self.dataset_name}' with {len(renamed_files)} new images"
         except Exception as e:
             return False, f"Error updating Hugging Face dataset: {str(e)}"
 
-def process_pipeline(folder_id, naming_convention, dataset_name):
+def process_pipeline(folder_id, naming_convention):
     """Main pipeline to process images and update dataset"""
     manager = DatasetManager()
 
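Design note: as committed, the 'image' entries are plain path strings, so push_to_hub publishes file paths rather than the pictures themselves. If the intent is to embed the downloaded images in the Hub dataset, the datasets library's Image feature can be cast onto the column before pushing; a minimal sketch, reusing the df built in update_huggingface_dataset:

# Sketch only: cast the path-valued 'image' column to the Image feature
# so push_to_hub embeds the actual files instead of their local paths.
from datasets import Dataset, Image

new_dataset = Dataset.from_pandas(df)
new_dataset = new_dataset.cast_column("image", Image())  # loads images from the path strings
new_dataset.push_to_hub("GotThatData/sports-cards", split="train")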
@@ -103,33 +128,27 @@ def process_pipeline(folder_id, naming_convention, dataset_name):
         return message
 
     # Step 3: Update Hugging Face dataset
-    if dataset_name:
-        success, hf_message = manager.update_huggingface_dataset(dataset_name, renamed_files)
-        return f"{message}\n{hf_message}"
-
-    return message
+    success, hf_message = manager.update_huggingface_dataset(renamed_files)
+    return f"{message}\n{hf_message}"
 
 # Gradio interface
 demo = gr.Interface(
     fn=process_pipeline,
     inputs=[
         gr.Textbox(
-            label="Google Drive Folder ID",
-            placeholder="Enter the folder ID from your Google Drive URL"
+            label="Google Drive File/Folder ID",
+            placeholder="Enter the ID from your Google Drive URL",
+            value="151VOxPO91mg0C3ORiioGUd4hogzP1ujm"  # Pre-filled with provided ID
         ),
         gr.Textbox(
             label="Naming Convention",
             placeholder="e.g., card",
-            value="card"
-        ),
-        gr.Textbox(
-            label="Hugging Face Dataset Name (Optional)",
-            placeholder="username/dataset-name"
+            value="sports_card"
         )
     ],
     outputs=gr.Textbox(label="Status"),
-    title="Card Image Processor",
-    description="Download card images from Google Drive and add them to your Hugging Face dataset"
+    title="Sports Cards Dataset Processor",
+    description="Download card images from Google Drive and add them to the sports-cards dataset"
 )
 
 if __name__ == "__main__":
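For a quick smoke test of the new two-argument pipeline, something along these lines should work once Google Drive credentials and a Hugging Face token are configured; the folder ID and naming convention below are simply the values pre-filled in the Gradio UI above:

# Hypothetical smoke test for the updated process_pipeline signature.
# Assumes PyDrive auth and `huggingface-cli login` are already set up.
status = process_pipeline(
    folder_id="151VOxPO91mg0C3ORiioGUd4hogzP1ujm",  # ID pre-filled in the UI
    naming_convention="sports_card",                # default naming prefix
)
print(status)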