davanstrien HF staff committed on
Commit
952523e
·
1 Parent(s): e71f5a4

percentage

Browse files
Files changed (1) hide show
  1. app.py +26 -17
app.py CHANGED
@@ -21,25 +21,27 @@ CPU_COUNT = multiprocessing.cpu_count()
21
  MAX_WORKERS = min(32, CPU_COUNT) # Use CPU count directly for processes
22
 
23
 
24
- def process_pdf(pdf_file, sample_size, temp_dir):
25
  try:
26
  pdf_path = pdf_file.name
27
  doc = fitz.open(pdf_path)
28
  total_pages = len(doc)
29
 
30
- pages_to_convert = (
31
- total_pages if sample_size == 0 else min(sample_size, total_pages)
32
- )
 
 
33
  selected_pages = (
34
  sorted(random.sample(range(total_pages), pages_to_convert))
35
- if sample_size > 0 and sample_size < total_pages
36
  else range(total_pages)
37
  )
38
 
39
  images = []
40
  for page_num in selected_pages:
41
  page = doc[page_num]
42
- pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # Increase resolution
43
  image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
44
  image_path = os.path.join(
45
  temp_dir, f"{os.path.basename(pdf_path)}_page_{page_num+1}.jpg"
@@ -53,7 +55,7 @@ def process_pdf(pdf_file, sample_size, temp_dir):
53
  return [], f"Error processing {pdf_file.name}: {str(e)}", 0
54
 
55
 
56
- def pdf_to_images(pdf_files, sample_size, temp_dir, progress=gr.Progress()):
57
  if not os.path.exists(temp_dir):
58
  os.makedirs(temp_dir)
59
 
@@ -66,7 +68,7 @@ def pdf_to_images(pdf_files, sample_size, temp_dir, progress=gr.Progress()):
66
 
67
  with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
68
  future_to_pdf = {
69
- executor.submit(process_pdf, pdf, sample_size, temp_dir): pdf
70
  for pdf in pdf_files
71
  }
72
 
@@ -103,7 +105,7 @@ def get_size_category(num_images):
103
 
104
  def process_pdfs(
105
  pdf_files,
106
- sample_size,
107
  hf_repo,
108
  create_zip,
109
  private_repo,
@@ -134,7 +136,7 @@ def process_pdfs(
134
  os.makedirs(images_dir)
135
 
136
  progress(0, desc="Starting PDF processing")
137
- images, message = pdf_to_images(pdf_files, sample_size, images_dir)
138
 
139
  # Create a new directory for sampled images
140
  sampled_images_dir = os.path.join(temp_dir, "sampled_images")
@@ -195,7 +197,9 @@ def process_pdfs(
195
  hf_repo=hf_repo,
196
  num_images=len(images),
197
  num_pdfs=len(pdf_files),
198
- sample_size=sample_size if sample_size > 0 else "All pages",
 
 
199
  creation_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
200
  )
201
 
@@ -204,7 +208,9 @@ def process_pdfs(
204
  hf_repo=hf_repo,
205
  num_images=len(images),
206
  num_pdfs=len(pdf_files),
207
- sample_size=sample_size if sample_size > 0 else "All pages",
 
 
208
  creation_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
209
  size_category=size_category,
210
  )
@@ -248,10 +254,13 @@ with gr.Blocks() as demo:
248
  file_count="multiple", label="Upload PDF(s)", file_types=["*.pdf"]
249
  )
250
  with gr.Row():
251
- sample_size = gr.Number(
252
- value=None,
253
- label="Pages per PDF (0 for all pages)",
254
- info="Specify how many pages to convert from each PDF. Use 0 to convert all pages.",
 
 
 
255
  )
256
  hf_repo = gr.Textbox(
257
  label="Hugging Face Repo",
@@ -269,7 +278,7 @@ with gr.Blocks() as demo:
269
  submit_button = gr.Button("Convert PDFs to page images")
270
  submit_button.click(
271
  process_pdfs,
272
- inputs=[pdf_files, sample_size, hf_repo, create_zip, private_repo],
273
  outputs=[output_gallery, download_button, status_text],
274
  )
275
 
 
21
  MAX_WORKERS = min(32, CPU_COUNT) # Use CPU count directly for processes
22
 
23
 
24
+ def process_pdf(pdf_file, sample_percentage, temp_dir):
25
  try:
26
  pdf_path = pdf_file.name
27
  doc = fitz.open(pdf_path)
28
  total_pages = len(doc)
29
 
30
+ pages_to_convert = int(total_pages * (sample_percentage / 100))
31
+ pages_to_convert = max(
32
+ 1, min(pages_to_convert, total_pages)
33
+ ) # Ensure at least one page and not more than total pages
34
+
35
  selected_pages = (
36
  sorted(random.sample(range(total_pages), pages_to_convert))
37
+ if 0 < sample_percentage < 100
38
  else range(total_pages)
39
  )
40
 
41
  images = []
42
  for page_num in selected_pages:
43
  page = doc[page_num]
44
+ pix = page.get_pixmap() # Remove the Matrix scaling
45
  image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
46
  image_path = os.path.join(
47
  temp_dir, f"{os.path.basename(pdf_path)}_page_{page_num+1}.jpg"
 
55
  return [], f"Error processing {pdf_file.name}: {str(e)}", 0
56
 
57
 
58
+ def pdf_to_images(pdf_files, sample_percentage, temp_dir, progress=gr.Progress()):
59
  if not os.path.exists(temp_dir):
60
  os.makedirs(temp_dir)
61
 
 
68
 
69
  with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
70
  future_to_pdf = {
71
+ executor.submit(process_pdf, pdf, sample_percentage, temp_dir): pdf
72
  for pdf in pdf_files
73
  }
74
 
 
105
 
106
  def process_pdfs(
107
  pdf_files,
108
+ sample_percentage,
109
  hf_repo,
110
  create_zip,
111
  private_repo,
 
136
  os.makedirs(images_dir)
137
 
138
  progress(0, desc="Starting PDF processing")
139
+ images, message = pdf_to_images(pdf_files, sample_percentage, images_dir)
140
 
141
  # Create a new directory for sampled images
142
  sampled_images_dir = os.path.join(temp_dir, "sampled_images")
 
197
  hf_repo=hf_repo,
198
  num_images=len(images),
199
  num_pdfs=len(pdf_files),
200
+ sample_size=sample_percentage
201
+ if sample_percentage > 0
202
+ else "All pages",
203
  creation_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
204
  )
205
 
 
208
  hf_repo=hf_repo,
209
  num_images=len(images),
210
  num_pdfs=len(pdf_files),
211
+ sample_size=sample_percentage
212
+ if sample_percentage > 0
213
+ else "All pages",
214
  creation_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
215
  size_category=size_category,
216
  )
 
254
  file_count="multiple", label="Upload PDF(s)", file_types=["*.pdf"]
255
  )
256
  with gr.Row():
257
+ sample_percentage = gr.Slider(
258
+ minimum=0,
259
+ maximum=100,
260
+ value=100,
261
+ step=1,
262
+ label="Percentage of pages to sample per PDF",
263
+ info="0% for no sampling (all pages), 100% for all pages",
264
  )
265
  hf_repo = gr.Textbox(
266
  label="Hugging Face Repo",
 
278
  submit_button = gr.Button("Convert PDFs to page images")
279
  submit_button.click(
280
  process_pdfs,
281
+ inputs=[pdf_files, sample_percentage, hf_repo, create_zip, private_repo],
282
  outputs=[output_gallery, download_button, status_text],
283
  )
284