davanstrien (HF staff) committed on
Commit c3e4c21 (1 parent: 33de52b)

process in parallel

Files changed (1):
  app.py (+35, -19)
app.py CHANGED
@@ -1,3 +1,4 @@
+import multiprocessing
 import os
 import random
 import shutil
@@ -9,13 +10,18 @@ from datetime import datetime
 import fitz  # PyMuPDF
 import gradio as gr
 from huggingface_hub import DatasetCard, DatasetCardData, HfApi
+from PIL import Image
 
 from dataset_card_template import DATASET_CARD_TEMPLATE
 
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 
 
-def process_pdf(pdf_file, sample_size, temp_dir, progress=gr.Progress()):
+CPU_COUNT = multiprocessing.cpu_count()
+MAX_WORKERS = min(32, CPU_COUNT)  # Use CPU count directly for processes
+
+
+def process_pdf(pdf_file, sample_size, temp_dir):
     try:
         pdf_path = pdf_file.name
         doc = fitz.open(pdf_path)
@@ -31,21 +37,20 @@ def process_pdf(pdf_file, sample_size, temp_dir, progress=gr.Progress()):
         )
 
         images = []
-        for page_num in progress.tqdm(
-            selected_pages, desc=f"Converting {os.path.basename(pdf_path)}", unit="page"
-        ):
+        for page_num in selected_pages:
             page = doc[page_num]
-            pix = page.get_pixmap()
+            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # Increase resolution
+            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
             image_path = os.path.join(
-                temp_dir, f"{os.path.basename(pdf_path)}_page_{page_num+1}.png"
+                temp_dir, f"{os.path.basename(pdf_path)}_page_{page_num+1}.jpg"
             )
-            pix.save(image_path)
+            image.save(image_path, "JPEG", quality=85, optimize=True)
             images.append(image_path)
 
         doc.close()
-        return images, None
+        return images, None, len(images)
     except Exception as e:
-        return [], f"Error processing {pdf_file.name}: {str(e)}"
+        return [], f"Error processing {pdf_file.name}: {str(e)}", 0
 
 
 def pdf_to_images(pdf_files, sample_size, temp_dir, progress=gr.Progress()):
@@ -56,15 +61,26 @@ def pdf_to_images(pdf_files, sample_size, temp_dir, progress=gr.Progress()):
     all_images = []
     skipped_pdfs = []
 
-    for i, pdf_file in enumerate(
-        progress.tqdm(pdf_files, desc="Converting PDFs", unit="PDF")
-    ):
-        images, error = process_pdf(pdf_file, sample_size, temp_dir, progress)
-        if error:
-            skipped_pdfs.append(pdf_file.name)
-            gr.Info(error)
-        else:
-            all_images.extend(images)
+    total_pages = sum(len(fitz.open(pdf.name)) for pdf in pdf_files)
+    processed_pages = 0
+
+    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+        future_to_pdf = {
+            executor.submit(process_pdf, pdf, sample_size, temp_dir): pdf
+            for pdf in pdf_files
+        }
+
+        for future in as_completed(future_to_pdf):
+            pdf = future_to_pdf[future]
+            images, error, pages_processed = future.result()
+            if error:
+                skipped_pdfs.append(error)
+                gr.Info(error)
+            else:
+                all_images.extend(images)
+
+            processed_pages += pages_processed
+            progress((processed_pages / total_pages), desc=f"Processing {pdf.name}")
 
     message = f"Saved {len(all_images)} images to temporary directory"
     if skipped_pdfs:
@@ -138,7 +154,7 @@ def process_pdfs(
         repo_type="dataset",
         private=private_repo,
     )
-    hf_api.upload_large_folder(
+    hf_api.upload_folder(
         folder_path=temp_dir,
         repo_id=hf_repo,
         repo_type="dataset",
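
For context, here is a minimal, self-contained sketch of the conversion pattern the new code relies on: fan PDFs out over a thread pool, render each page at 2x resolution with PyMuPDF, and hand the pixmap to Pillow for JPEG output. The hunks above do not show where ThreadPoolExecutor and as_completed come from, so the concurrent.futures import below is an assumption about the rest of app.py; the helper names and paths are illustrative, not part of the commit.

# Sketch of the parallel PDF -> JPEG pattern; names and paths are illustrative.
import multiprocessing
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import fitz  # PyMuPDF
from PIL import Image

MAX_WORKERS = min(32, multiprocessing.cpu_count())


def render_pdf(pdf_path, out_dir):
    """Render every page of one PDF to a JPEG and return the image paths."""
    doc = fitz.open(pdf_path)
    paths = []
    for page_num in range(len(doc)):
        pix = doc[page_num].get_pixmap(matrix=fitz.Matrix(2, 2))  # render at 2x
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        out_path = os.path.join(
            out_dir, f"{os.path.basename(pdf_path)}_page_{page_num + 1}.jpg"
        )
        image.save(out_path, "JPEG", quality=85, optimize=True)
        paths.append(out_path)
    doc.close()
    return paths


def render_all(pdf_paths, out_dir):
    """Submit one task per PDF and collect results as they complete."""
    all_images = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {executor.submit(render_pdf, p, out_dir): p for p in pdf_paths}
        for future in as_completed(futures):
            # app.py catches per-PDF errors inside process_pdf; here they would propagate.
            all_images.extend(future.result())
    return all_images

Note that the commit's comment mentions processes while the code uses ThreadPoolExecutor; with plain string arguments as in this sketch, ProcessPoolExecutor would be a drop-in alternative if rendering proves CPU-bound.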
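
The final hunk swaps hf_api.upload_large_folder for hf_api.upload_folder; both are real HfApi methods. A hedged sketch of the resulting upload step, with the repository id and local folder as placeholder values standing in for hf_repo and temp_dir:

# Sketch of the upload step after the change; repo id and folder path are placeholders.
from huggingface_hub import HfApi

hf_api = HfApi()
hf_api.create_repo(
    repo_id="user/pdf-page-images",  # hf_repo in app.py
    repo_type="dataset",
    private=True,
    exist_ok=True,
)
hf_api.upload_folder(
    folder_path="/tmp/pdf_images",   # temp_dir in app.py
    repo_id="user/pdf-page-images",
    repo_type="dataset",
)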