davanstrien HF staff committed on
Commit
24052a1
·
1 Parent(s): 6532453
Files changed (1) hide show
  1. app.py +64 -61
app.py CHANGED
@@ -121,6 +121,15 @@ def process_pdfs(
121
  ),
122
  )
123
 
 
 
 
 
 
 
 
 
 
124
  try:
125
  temp_dir = tempfile.mkdtemp()
126
  images_dir = os.path.join(temp_dir, "images")
@@ -157,67 +166,61 @@ def process_pdfs(
157
  message += f"\nCreated zip file with {len(images)} images"
158
 
159
  if hf_repo:
160
- if oauth_token is None:
161
- message += "\n⚠️ Not logged in to Hugging Face. Please log in to upload to a Hugging Face dataset."
162
- else:
163
- try:
164
- hf_api = HfApi(token=oauth_token.token)
165
- hf_api.create_repo(
166
- hf_repo,
167
- repo_type="dataset",
168
- private=private_repo,
169
- )
170
- # Upload only the sampled images directory
171
- hf_api.upload_folder(
172
- folder_path=sampled_images_dir,
173
- repo_id=hf_repo,
174
- repo_type="dataset",
175
- path_in_repo="images",
176
- )
177
-
178
- # Determine size category
179
- size_category = get_size_category(len(images))
180
-
181
- # Create DatasetCardData instance
182
- card_data = DatasetCardData(
183
- tags=[
184
- "created-with-pdfs-to-page-images-converter",
185
- "pdf-to-image",
186
- ],
187
- size_categories=[size_category],
188
- )
189
-
190
- # Create and populate the dataset card
191
- card = DatasetCard.from_template(
192
- card_data,
193
- template_path=None, # Use default template
194
- hf_repo=hf_repo,
195
- num_images=len(images),
196
- num_pdfs=len(pdf_files),
197
- sample_size=sample_percentage
198
- if sample_percentage > 0
199
- else "All pages",
200
- creation_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
201
- )
202
-
203
- # Add our custom content to the card
204
- card.text = DATASET_CARD_TEMPLATE.format(
205
- hf_repo=hf_repo,
206
- num_images=len(images),
207
- num_pdfs=len(pdf_files),
208
- sample_size=sample_percentage
209
- if sample_percentage > 0
210
- else "All pages",
211
- creation_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
212
- size_category=size_category,
213
- )
214
-
215
- repo_url = f"https://huggingface.co/datasets/{hf_repo}"
216
- message += f"\nUploaded dataset card to Hugging Face repo: [{hf_repo}]({repo_url})"
217
-
218
- card.push_to_hub(hf_repo, token=oauth_token.token)
219
- except Exception as e:
220
- message += f"\nFailed to upload to Hugging Face: {str(e)}"
221
 
222
  return images, zip_path, message
223
  except Exception as e:
 
121
  ),
122
  )
123
 
124
+ if oauth_token is None:
125
+ return (
126
+ None,
127
+ None,
128
+ gr.Markdown(
129
+ "⚠️ Not logged in to Hugging Face. Please log in to upload to a Hugging Face dataset."
130
+ ),
131
+ )
132
+
133
  try:
134
  temp_dir = tempfile.mkdtemp()
135
  images_dir = os.path.join(temp_dir, "images")
 
166
  message += f"\nCreated zip file with {len(images)} images"
167
 
168
  if hf_repo:
169
+ try:
170
+ hf_api = HfApi(token=oauth_token.token)
171
+ hf_api.create_repo(
172
+ hf_repo,
173
+ repo_type="dataset",
174
+ private=private_repo,
175
+ )
176
+ # Upload only the sampled images directory
177
+ hf_api.upload_folder(
178
+ folder_path=sampled_images_dir,
179
+ repo_id=hf_repo,
180
+ repo_type="dataset",
181
+ path_in_repo="images",
182
+ )
183
+
184
+ # Determine size category
185
+ size_category = get_size_category(len(images))
186
+
187
+ # Create DatasetCardData instance
188
+ card_data = DatasetCardData(
189
+ tags=["created-with-pdfs-to-page-images-converter", "pdf-to-image"],
190
+ size_categories=[size_category],
191
+ )
192
+
193
+ # Create and populate the dataset card
194
+ card = DatasetCard.from_template(
195
+ card_data,
196
+ template_path=None, # Use default template
197
+ hf_repo=hf_repo,
198
+ num_images=len(images),
199
+ num_pdfs=len(pdf_files),
200
+ sample_size=sample_percentage
201
+ if sample_percentage > 0
202
+ else "All pages",
203
+ creation_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
204
+ )
205
+
206
+ # Add our custom content to the card
207
+ card.text = DATASET_CARD_TEMPLATE.format(
208
+ hf_repo=hf_repo,
209
+ num_images=len(images),
210
+ num_pdfs=len(pdf_files),
211
+ sample_size=sample_percentage
212
+ if sample_percentage > 0
213
+ else "All pages",
214
+ creation_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
215
+ size_category=size_category,
216
+ )
217
+
218
+ repo_url = f"https://huggingface.co/datasets/{hf_repo}"
219
+ message += f"\nUploaded dataset card to Hugging Face repo: [{hf_repo}]({repo_url})"
220
+
221
+ card.push_to_hub(hf_repo, token=oauth_token.token)
222
+ except Exception as e:
223
+ message += f"\nFailed to upload to Hugging Face: {str(e)}"
 
 
 
 
 
 
224
 
225
  return images, zip_path, message
226
  except Exception as e: