Spaces:
Running
on
Zero
Running
on
Zero
carpelan
committed on
Commit
·
ad688d5
1
Parent(s):
d3fc046
fixed with pymupdf
Browse files- app/tabs/submit.py +41 -2
- pyproject.toml +1 -1
- requirements.txt +1 -1
- uv.lock +17 -14
app/tabs/submit.py
CHANGED
@@ -5,6 +5,7 @@ import re
|
|
5 |
import time
|
6 |
|
7 |
import certifi
|
|
|
8 |
import gradio as gr
|
9 |
import pycurl
|
10 |
import spaces
|
@@ -13,7 +14,7 @@ from gradio_modal import Modal
|
|
13 |
from htrflow.pipeline.pipeline import Pipeline
|
14 |
from htrflow.pipeline.steps import init_step
|
15 |
from htrflow.volume.volume import Collection
|
16 |
-
from
|
17 |
|
18 |
from app.pipelines import PIPELINES
|
19 |
|
@@ -70,6 +71,44 @@ class PipelineWithProgress(Pipeline):
|
|
70 |
return collection
|
71 |
|
72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
@spaces.GPU
|
74 |
def run_htrflow(custom_template_yaml, batch_image_gallery, progress=gr.Progress()):
|
75 |
"""
|
@@ -424,7 +463,7 @@ with gr.Blocks() as submit:
|
|
424 |
image_url.submit(lambda url: [url], image_url, batch_image_gallery)
|
425 |
|
426 |
pdf_file.upload(
|
427 |
-
lambda imgs:
|
428 |
)
|
429 |
|
430 |
run_button.click(
|
|
|
5 |
import time
|
6 |
|
7 |
import certifi
|
8 |
+
import fitz # PyMuPDF
|
9 |
import gradio as gr
|
10 |
import pycurl
|
11 |
import spaces
|
|
|
14 |
from htrflow.pipeline.pipeline import Pipeline
|
15 |
from htrflow.pipeline.steps import init_step
|
16 |
from htrflow.volume.volume import Collection
|
17 |
+
from PIL import Image
|
18 |
|
19 |
from app.pipelines import PIPELINES
|
20 |
|
|
|
71 |
return collection
|
72 |
|
73 |
|
74 |
+
def pdf_to_images(pdf_path, dpi=None):
    """
    Render every page of a PDF file to a PIL Image using PyMuPDF.

    Each page is rasterized as a whole; embedded images are not extracted.
    With ``dpi=None`` the page is rendered at PyMuPDF's default resolution
    (72 dpi according to the PyMuPDF documentation), which is low for
    text recognition -- pass a higher ``dpi`` for sharper output.

    Args:
        pdf_path (str): Path to the PDF file
        dpi (int, optional): Rendering resolution forwarded to
            ``Page.get_pixmap``. Defaults to None (PyMuPDF default).

    Returns:
        list: List of RGB PIL Image objects, one per page, in page order
    """
    images = []

    # Opening the document as a context manager guarantees it is closed
    # even when rendering one of the pages raises.
    with fitz.open(pdf_path) as pdf_document:
        for page in pdf_document:
            # Force an RGB pixmap without alpha so the raw sample buffer
            # is exactly 3 bytes per pixel, matching the "RGB" mode below.
            pixmap = page.get_pixmap(dpi=dpi, colorspace=fitz.csRGB, alpha=False)

            # Build the image straight from the raw samples: this avoids a
            # lossy JPEG encode/decode round trip and copies the pixel data,
            # so the image stays valid after the document is closed.
            img = Image.frombytes(
                "RGB", (pixmap.width, pixmap.height), pixmap.samples
            )
            images.append(img)

    return images
|
110 |
+
|
111 |
+
|
112 |
@spaces.GPU
|
113 |
def run_htrflow(custom_template_yaml, batch_image_gallery, progress=gr.Progress()):
|
114 |
"""
|
|
|
463 |
image_url.submit(lambda url: [url], image_url, batch_image_gallery)
|
464 |
|
465 |
pdf_file.upload(
|
466 |
+
lambda imgs: pdf_to_images(imgs), inputs=pdf_file, outputs=pdf_gallery
|
467 |
)
|
468 |
|
469 |
run_button.click(
|
pyproject.toml
CHANGED
@@ -24,7 +24,7 @@ dependencies = [
|
|
24 |
"dill>=0.3.9",
|
25 |
"spaces>=0.32.0",
|
26 |
"pycurl",
|
27 |
-
"
|
28 |
]
|
29 |
|
30 |
[project.urls]
|
|
|
24 |
"dill>=0.3.9",
|
25 |
"spaces>=0.32.0",
|
26 |
"pycurl",
|
27 |
+
"pymupdf>=1.25.3",
|
28 |
]
|
29 |
|
30 |
[project.urls]
|
requirements.txt
CHANGED
@@ -4,4 +4,4 @@ tqdm>=4.67.1
|
|
4 |
gradio-modal>=0.0.4
|
5 |
dill>=0.3.9
|
6 |
pycurl>=7.45.6
|
7 |
-
|
|
|
4 |
gradio-modal>=0.0.4
|
5 |
dill>=0.3.9
|
6 |
pycurl>=7.45.6
|
7 |
+
pymupdf>=1.25.3
|
uv.lock
CHANGED
@@ -543,8 +543,8 @@ dependencies = [
|
|
543 |
{ name = "gradio" },
|
544 |
{ name = "gradio-modal" },
|
545 |
{ name = "htrflow" },
|
546 |
-
{ name = "pdf2image" },
|
547 |
{ name = "pycurl" },
|
|
|
548 |
{ name = "spaces" },
|
549 |
{ name = "tqdm" },
|
550 |
]
|
@@ -562,8 +562,8 @@ requires-dist = [
|
|
562 |
{ name = "gradio", specifier = ">=5.17.0" },
|
563 |
{ name = "gradio-modal", specifier = ">=0.0.4" },
|
564 |
{ name = "htrflow", specifier = "==0.2.5" },
|
565 |
-
{ name = "pdf2image", specifier = ">=1.17.0" },
|
566 |
{ name = "pycurl" },
|
|
|
567 |
{ name = "spaces", specifier = ">=0.32.0" },
|
568 |
{ name = "tqdm", specifier = ">=4.67.1" },
|
569 |
]
|
@@ -1185,18 +1185,6 @@ wheels = [
|
|
1185 |
{ url = "https://files.pythonhosted.org/packages/29/d4/1244ab8edf173a10fd601f7e13b9566c1b525c4f365d6bee918e68381889/pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13", size = 11504248 },
|
1186 |
]
|
1187 |
|
1188 |
-
[[package]]
|
1189 |
-
name = "pdf2image"
|
1190 |
-
version = "1.17.0"
|
1191 |
-
source = { registry = "https://pypi.org/simple" }
|
1192 |
-
dependencies = [
|
1193 |
-
{ name = "pillow" },
|
1194 |
-
]
|
1195 |
-
sdist = { url = "https://files.pythonhosted.org/packages/00/d8/b280f01045555dc257b8153c00dee3bc75830f91a744cd5f84ef3a0a64b1/pdf2image-1.17.0.tar.gz", hash = "sha256:eaa959bc116b420dd7ec415fcae49b98100dda3dd18cd2fdfa86d09f112f6d57", size = 12811 }
|
1196 |
-
wheels = [
|
1197 |
-
{ url = "https://files.pythonhosted.org/packages/62/33/61766ae033518957f877ab246f87ca30a85b778ebaad65b7f74fa7e52988/pdf2image-1.17.0-py3-none-any.whl", hash = "sha256:ecdd58d7afb810dffe21ef2b1bbc057ef434dabbac6c33778a38a3f7744a27e2", size = 11618 },
|
1198 |
-
]
|
1199 |
-
|
1200 |
[[package]]
|
1201 |
name = "pfzy"
|
1202 |
version = "0.3.4"
|
@@ -1492,6 +1480,21 @@ wheels = [
|
|
1492 |
{ url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 },
|
1493 |
]
|
1494 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1495 |
[[package]]
|
1496 |
name = "pyparsing"
|
1497 |
version = "3.2.1"
|
|
|
543 |
{ name = "gradio" },
|
544 |
{ name = "gradio-modal" },
|
545 |
{ name = "htrflow" },
|
|
|
546 |
{ name = "pycurl" },
|
547 |
+
{ name = "pymupdf" },
|
548 |
{ name = "spaces" },
|
549 |
{ name = "tqdm" },
|
550 |
]
|
|
|
562 |
{ name = "gradio", specifier = ">=5.17.0" },
|
563 |
{ name = "gradio-modal", specifier = ">=0.0.4" },
|
564 |
{ name = "htrflow", specifier = "==0.2.5" },
|
|
|
565 |
{ name = "pycurl" },
|
566 |
+
{ name = "pymupdf", specifier = ">=1.25.3" },
|
567 |
{ name = "spaces", specifier = ">=0.32.0" },
|
568 |
{ name = "tqdm", specifier = ">=4.67.1" },
|
569 |
]
|
|
|
1185 |
{ url = "https://files.pythonhosted.org/packages/29/d4/1244ab8edf173a10fd601f7e13b9566c1b525c4f365d6bee918e68381889/pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13", size = 11504248 },
|
1186 |
]
|
1187 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1188 |
[[package]]
|
1189 |
name = "pfzy"
|
1190 |
version = "0.3.4"
|
|
|
1480 |
{ url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 },
|
1481 |
]
|
1482 |
|
1483 |
+
[[package]]
|
1484 |
+
name = "pymupdf"
|
1485 |
+
version = "1.25.3"
|
1486 |
+
source = { registry = "https://pypi.org/simple" }
|
1487 |
+
sdist = { url = "https://files.pythonhosted.org/packages/06/47/b61c1c44b87cbdaeecdec3f43ce524ed6b3c72172bc6184eb82c94fbc43d/pymupdf-1.25.3.tar.gz", hash = "sha256:b640187c64c5ac5d97505a92e836da299da79c2f689f3f94a67a37a493492193", size = 67259841 }
|
1488 |
+
wheels = [
|
1489 |
+
{ url = "https://files.pythonhosted.org/packages/61/9b/98ef4b98309e9db3baa9fe572f0e61b6130bb9852d13189970f35b703499/pymupdf-1.25.3-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:96878e1b748f9c2011aecb2028c5f96b5a347a9a91169130ad0133053d97915e", size = 19343576 },
|
1490 |
+
{ url = "https://files.pythonhosted.org/packages/14/62/4e12126db174c8cfbf692281cda971cc4046c5f5226032c2cfaa6f83e08d/pymupdf-1.25.3-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:6ef753005b72ebfd23470f72f7e30f61e21b0b5e748045ec5b8f89e6e3068d62", size = 18580114 },
|
1491 |
+
{ url = "https://files.pythonhosted.org/packages/ec/c5/cf7ecf005e4f8ba3664d6aaa0613adeba4c2ab524832c452c69857e7184f/pymupdf-1.25.3-cp39-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cbff443d899f37b17f1e67563cc03673d50b4bf33ccc237e73d34f18f3a07ccf", size = 19442580 },
|
1492 |
+
{ url = "https://files.pythonhosted.org/packages/52/de/bd1418e31f73d37b8381cd5deacfd681e6be702b8890e123e83724569ee1/pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:46d90c4f9e62d1856e8db4b9f04a202ff4a7f086a816af73abdc86adb7f5e25a", size = 19999825 },
|
1493 |
+
{ url = "https://files.pythonhosted.org/packages/42/ee/3c449b0de061440ba1ac984aa845315e9e2dca0ff2003c5adfc6febff203/pymupdf-1.25.3-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a5de51efdbe4d486b6c1111c84e8a231cbfb426f3d6ff31ab530ad70e6f39756", size = 21123157 },
|
1494 |
+
{ url = "https://files.pythonhosted.org/packages/83/53/71faaaf91c56f2883b13f3dd849bf2697f012eb35eb7b952d62734cff41f/pymupdf-1.25.3-cp39-abi3-win32.whl", hash = "sha256:bca72e6089f985d800596e22973f79cc08af6cbff1d93e5bda9248326a03857c", size = 15094211 },
|
1495 |
+
{ url = "https://files.pythonhosted.org/packages/09/e0/d72e88a1d5e23aa381fd463057dc3d0fb29090e1e7308a870c334716579c/pymupdf-1.25.3-cp39-abi3-win_amd64.whl", hash = "sha256:4fb357438c9129fbf939b5af85323434df64e36759c399c376b62ad6da95498c", size = 16542949 },
|
1496 |
+
]
|
1497 |
+
|
1498 |
[[package]]
|
1499 |
name = "pyparsing"
|
1500 |
version = "3.2.1"
|