davanstrien HF staff committed on
Commit
4017ee6
1 Parent(s): 1077963
Files changed (1) hide show
  1. app.py +69 -65
app.py CHANGED
@@ -121,15 +121,6 @@ def process_pdfs(
121
  ),
122
  )
123
 
124
- if oauth_token is None:
125
- return (
126
- None,
127
- None,
128
- gr.Markdown(
129
- "⚠️ Not logged in to Hugging Face. Please log in to upload to a Hugging Face dataset."
130
- ),
131
- )
132
-
133
  try:
134
  temp_dir = tempfile.mkdtemp()
135
  images_dir = os.path.join(temp_dir, "images")
@@ -166,61 +157,67 @@ def process_pdfs(
166
  message += f"\nCreated zip file with {len(images)} images"
167
 
168
  if hf_repo:
169
- try:
170
- hf_api = HfApi(token=oauth_token.token)
171
- hf_api.create_repo(
172
- hf_repo,
173
- repo_type="dataset",
174
- private=private_repo,
175
- )
176
- # Upload only the sampled images directory
177
- hf_api.upload_folder(
178
- folder_path=sampled_images_dir,
179
- repo_id=hf_repo,
180
- repo_type="dataset",
181
- path_in_repo="images",
182
- )
183
-
184
- # Determine size category
185
- size_category = get_size_category(len(images))
186
-
187
- # Create DatasetCardData instance
188
- card_data = DatasetCardData(
189
- tags=["created-with-pdfs-to-page-images-converter", "pdf-to-image"],
190
- size_categories=[size_category],
191
- )
192
-
193
- # Create and populate the dataset card
194
- card = DatasetCard.from_template(
195
- card_data,
196
- template_path=None, # Use default template
197
- hf_repo=hf_repo,
198
- num_images=len(images),
199
- num_pdfs=len(pdf_files),
200
- sample_size=sample_percentage
201
- if sample_percentage > 0
202
- else "All pages",
203
- creation_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
204
- )
205
-
206
- # Add our custom content to the card
207
- card.text = DATASET_CARD_TEMPLATE.format(
208
- hf_repo=hf_repo,
209
- num_images=len(images),
210
- num_pdfs=len(pdf_files),
211
- sample_size=sample_percentage
212
- if sample_percentage > 0
213
- else "All pages",
214
- creation_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
215
- size_category=size_category,
216
- )
217
-
218
- repo_url = f"https://huggingface.co/datasets/{hf_repo}"
219
- message += f"\nUploaded dataset card to Hugging Face repo: [{hf_repo}]({repo_url})"
220
-
221
- card.push_to_hub(hf_repo, token=oauth_token.token)
222
- except Exception as e:
223
- message += f"\nFailed to upload to Hugging Face: {str(e)}"
 
 
 
 
 
 
224
 
225
  return images, zip_path, message
226
  except Exception as e:
@@ -292,7 +289,14 @@ with gr.Blocks() as demo:
292
  submit_button = gr.Button("Convert PDFs to page images")
293
  submit_button.click(
294
  process_pdfs,
295
- inputs=[pdf_files, sample_percentage, hf_repo, create_zip, private_repo],
 
 
 
 
 
 
 
296
  outputs=[output_gallery, download_button, status_text],
297
  )
298
 
 
121
  ),
122
  )
123
 
 
 
 
 
 
 
 
 
 
124
  try:
125
  temp_dir = tempfile.mkdtemp()
126
  images_dir = os.path.join(temp_dir, "images")
 
157
  message += f"\nCreated zip file with {len(images)} images"
158
 
159
  if hf_repo:
160
+ if oauth_token is None:
161
+ message += "\n⚠️ Not logged in to Hugging Face. Please log in to upload to a Hugging Face dataset."
162
+ else:
163
+ try:
164
+ hf_api = HfApi(token=oauth_token.token)
165
+ hf_api.create_repo(
166
+ hf_repo,
167
+ repo_type="dataset",
168
+ private=private_repo,
169
+ )
170
+ # Upload only the sampled images directory
171
+ hf_api.upload_folder(
172
+ folder_path=sampled_images_dir,
173
+ repo_id=hf_repo,
174
+ repo_type="dataset",
175
+ path_in_repo="images",
176
+ )
177
+
178
+ # Determine size category
179
+ size_category = get_size_category(len(images))
180
+
181
+ # Create DatasetCardData instance
182
+ card_data = DatasetCardData(
183
+ tags=[
184
+ "created-with-pdfs-to-page-images-converter",
185
+ "pdf-to-image",
186
+ ],
187
+ size_categories=[size_category],
188
+ )
189
+
190
+ # Create and populate the dataset card
191
+ card = DatasetCard.from_template(
192
+ card_data,
193
+ template_path=None, # Use default template
194
+ hf_repo=hf_repo,
195
+ num_images=len(images),
196
+ num_pdfs=len(pdf_files),
197
+ sample_size=sample_percentage
198
+ if sample_percentage > 0
199
+ else "All pages",
200
+ creation_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
201
+ )
202
+
203
+ # Add our custom content to the card
204
+ card.text = DATASET_CARD_TEMPLATE.format(
205
+ hf_repo=hf_repo,
206
+ num_images=len(images),
207
+ num_pdfs=len(pdf_files),
208
+ sample_size=sample_percentage
209
+ if sample_percentage > 0
210
+ else "All pages",
211
+ creation_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
212
+ size_category=size_category,
213
+ )
214
+
215
+ repo_url = f"https://huggingface.co/datasets/{hf_repo}"
216
+ message += f"\nUploaded dataset card to Hugging Face repo: [{hf_repo}]({repo_url})"
217
+
218
+ card.push_to_hub(hf_repo, token=oauth_token.token)
219
+ except Exception as e:
220
+ message += f"\nFailed to upload to Hugging Face: {str(e)}"
221
 
222
  return images, zip_path, message
223
  except Exception as e:
 
289
  submit_button = gr.Button("Convert PDFs to page images")
290
  submit_button.click(
291
  process_pdfs,
292
+ inputs=[
293
+ pdf_files,
294
+ sample_percentage,
295
+ hf_repo,
296
+ create_zip,
297
+ private_repo,
298
+ gr.OAuthToken(),
299
+ ],
300
  outputs=[output_gallery, download_button, status_text],
301
  )
302