Spaces:
Running
on
Zero
Running
on
Zero
Gabriel
committed on
Commit
·
5e081d3
1
Parent(s):
4846384
added pycurl instead of requests
Browse files
- README.md +1 -1
- app/content/main_sub_title_hum.md +1 -1
- app/content/sidebar.md +2 -7
- app/main.py +0 -1
- app/tabs/submit.py +107 -52
- pyproject.toml +1 -0
- requirements.txt +2 -1
README.md
CHANGED
@@ -11,5 +11,5 @@ short_description: htrflow demo app
|
|
11 |
header: mini
|
12 |
thumbnail: >-
|
13 |
https://cdn-uploads.huggingface.co/production/uploads/60a4e677917119d38f6bbff8/-qMf3PaegicobqW5hXyiA.png
|
14 |
-
sdk_version: 5.
|
15 |
---
|
|
|
11 |
header: mini
|
12 |
thumbnail: >-
|
13 |
https://cdn-uploads.huggingface.co/production/uploads/60a4e677917119d38f6bbff8/-qMf3PaegicobqW5hXyiA.png
|
14 |
+
sdk_version: 5.20.1
|
15 |
---
|
app/content/main_sub_title_hum.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
<a href="https://www.huminfra.se/">
|
2 |
<img src="https://www.umu.se/globalassets/qbank/huminfra-till-webben-48568w600h600.jpg"
|
3 |
-
width="
|
4 |
style="margin-top: 0.3rem; border-radius: 0.5rem;" />
|
5 |
</a>
|
|
|
1 |
<a href="https://www.huminfra.se/">
|
2 |
<img src="https://www.umu.se/globalassets/qbank/huminfra-till-webben-48568w600h600.jpg"
|
3 |
+
width="50%"
|
4 |
style="margin-top: 0.3rem; border-radius: 0.5rem;" />
|
5 |
</a>
|
app/content/sidebar.md
CHANGED
@@ -1,14 +1,9 @@
|
|
1 |
## HTRflow Demo
|
2 |
|
3 |
-
Welcome to the **HTRflow Demo** – a web application developed by the **National Archives of Sweden** in collaboration with [Huminfra](https://www.huminfra.se/). This demo lets you explore how AI transforms historical manuscripts into digital text using [HTRflow](https://ai-riksarkivet.github.io/htrflow/latest).
|
4 |
-
|
5 |
-
### How It Works
|
6 |
-
|
7 |
-
1. **Upload:** Submit your own image to run through HTRflow.
|
8 |
-
2. **Results:** View the transcribed text generated by the system.
|
9 |
-
3. **Export:** Choose your preferred format and download your results.
|
10 |
|
11 |
> Note: This demo application is for demonstration purposes only and is not intended for production use.
|
|
|
12 |
|
13 |
### Contact
|
14 |
|
|
|
1 |
## HTRflow Demo
|
2 |
|
3 |
+
Welcome to the **HTRflow Demo** – a web application developed by the **National Archives of Sweden** in collaboration with [Huminfra](https://www.huminfra.se/). This demo lets you explore how AI transforms historical manuscripts into digital text using [HTRflow](https://ai-riksarkivet.github.io/htrflow/latest).
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
> Note: This demo application is for demonstration purposes only and is not intended for production use.
|
6 |
+
> The application is hosted on Hugging Face 🤗 using shared infrastructure, which means there is a daily quota limit on how much you can use the app each day.
|
7 |
|
8 |
### Contact
|
9 |
|
app/main.py
CHANGED
@@ -2,7 +2,6 @@ import logging
|
|
2 |
import os
|
3 |
|
4 |
import gradio as gr
|
5 |
-
from htrflow.models.huggingface.trocr import TrOCR
|
6 |
|
7 |
from app.gradio_config import css, theme
|
8 |
from app.tabs.export import collection as collection_export_state
|
|
|
2 |
import os
|
3 |
|
4 |
import gradio as gr
|
|
|
5 |
|
6 |
from app.gradio_config import css, theme
|
7 |
from app.tabs.export import collection as collection_export_state
|
app/tabs/submit.py
CHANGED
@@ -1,10 +1,12 @@
|
|
|
|
1 |
import logging
|
2 |
import os
|
3 |
-
import time
|
4 |
import re
|
5 |
-
import
|
6 |
|
|
|
7 |
import gradio as gr
|
|
|
8 |
import spaces
|
9 |
import yaml
|
10 |
from gradio_modal import Modal
|
@@ -34,12 +36,7 @@ class PipelineWithProgress(Pipeline):
|
|
34 |
@classmethod
|
35 |
def from_config(cls, config: dict[str, str]):
|
36 |
"""Init pipeline from config, ensuring the correct subclass is instantiated."""
|
37 |
-
return cls(
|
38 |
-
[
|
39 |
-
init_step(step["step"], step.get("settings", {}))
|
40 |
-
for step in config["steps"]
|
41 |
-
]
|
42 |
-
)
|
43 |
|
44 |
def run(self, collection, start=0, progress=None):
|
45 |
"""
|
@@ -67,7 +64,7 @@ class PipelineWithProgress(Pipeline):
|
|
67 |
return collection
|
68 |
|
69 |
|
70 |
-
@spaces.GPU
|
71 |
def run_htrflow(custom_template_yaml, batch_image_gallery, progress=gr.Progress()):
|
72 |
"""
|
73 |
Executes the HTRflow pipeline based on the provided YAML configuration and batch images.
|
@@ -98,9 +95,7 @@ def run_htrflow(custom_template_yaml, batch_image_gallery, progress=gr.Progress(
|
|
98 |
|
99 |
pipe = PipelineWithProgress.from_config(config)
|
100 |
|
101 |
-
gr.Info(
|
102 |
-
f"HTRflow: processing {len(images)} {'image' if len(images) == 1 else 'images'}."
|
103 |
-
)
|
104 |
progress(0.1, desc="HTRflow: Processing")
|
105 |
|
106 |
collection.label = "demo_output"
|
@@ -164,44 +159,97 @@ def get_image_from_image_id(image_id):
|
|
164 |
return [f"https://lbiiif.riksarkivet.se/arkis!{image_id}/full/max/0/default.jpg"]
|
165 |
|
166 |
|
167 |
-
def get_images_from_iiif_manifest(iiif_manifest_url):
|
168 |
-
|
169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
170 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
Arguments:
|
172 |
-
|
173 |
-
height: Max height of returned images
|
|
|
174 |
"""
|
175 |
try:
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
# Hacky solution to get all images regardless of API version - treat
|
183 |
# the manifest as a string and match everything that looks like an IIIF
|
184 |
# image URL.
|
185 |
-
manifest = response.text
|
186 |
pattern = r'(?P<identifier>https?://[^"\s]*)/(?P<region>[^"\s]*?)/(?P<size>[^"\s]*?)/(?P<rotation>!?\d*?)/(?P<quality>[^"\s]*?)\.(?P<format>jpg|tif|png|gif|jp2|pdf|webp)'
|
187 |
-
|
188 |
-
|
189 |
images = set() # create a set to eliminate duplicates (e.g. thumbnails and fullsize images)
|
|
|
190 |
for match in re.findall(pattern, manifest):
|
191 |
identifier, _, _, _, _, format_ = match
|
192 |
images.add(f"{identifier}/full/{height},/0/default.{format_}")
|
193 |
-
|
194 |
-
|
|
|
|
|
|
|
|
|
|
|
195 |
|
196 |
|
197 |
with gr.Blocks() as submit:
|
198 |
-
|
199 |
gr.Markdown("# Upload")
|
200 |
gr.Markdown("Select or upload the image you want to transcribe. You can upload up to five images at a time.")
|
201 |
|
202 |
collection_submit_state = gr.State()
|
203 |
|
204 |
-
|
205 |
with gr.Row(equal_height=True):
|
206 |
with gr.Column(scale=2):
|
207 |
batch_image_gallery = gr.Gallery(
|
@@ -238,26 +286,33 @@ with gr.Blocks() as submit:
|
|
238 |
)
|
239 |
|
240 |
with gr.Tab("IIIF Manifest"):
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
|
|
|
|
|
|
|
|
257 |
|
258 |
with gr.Tab("URL"):
|
259 |
-
image_url = gr.Textbox(
|
260 |
-
|
|
|
|
|
|
|
261 |
|
262 |
with gr.Column(variant="panel", elem_classes="panel-with-border"):
|
263 |
gr.Markdown("## Settings")
|
@@ -327,12 +382,12 @@ with gr.Blocks() as submit:
|
|
327 |
return gr.update(value=None)
|
328 |
return images
|
329 |
|
330 |
-
|
331 |
-
|
332 |
-
|
|
|
333 |
image_url.submit(lambda url: [url], image_url, batch_image_gallery)
|
334 |
|
335 |
-
|
336 |
run_button.click(
|
337 |
fn=run_htrflow,
|
338 |
inputs=[custom_template_yaml, batch_image_gallery],
|
|
|
1 |
+
import io
|
2 |
import logging
|
3 |
import os
|
|
|
4 |
import re
|
5 |
+
import time
|
6 |
|
7 |
+
import certifi
|
8 |
import gradio as gr
|
9 |
+
import pycurl
|
10 |
import spaces
|
11 |
import yaml
|
12 |
from gradio_modal import Modal
|
|
|
36 |
@classmethod
|
37 |
def from_config(cls, config: dict[str, str]):
|
38 |
"""Init pipeline from config, ensuring the correct subclass is instantiated."""
|
39 |
+
return cls([init_step(step["step"], step.get("settings", {})) for step in config["steps"]])
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
def run(self, collection, start=0, progress=None):
|
42 |
"""
|
|
|
64 |
return collection
|
65 |
|
66 |
|
67 |
+
# @spaces.GPU
|
68 |
def run_htrflow(custom_template_yaml, batch_image_gallery, progress=gr.Progress()):
|
69 |
"""
|
70 |
Executes the HTRflow pipeline based on the provided YAML configuration and batch images.
|
|
|
95 |
|
96 |
pipe = PipelineWithProgress.from_config(config)
|
97 |
|
98 |
+
gr.Info(f"HTRflow: processing {len(images)} {'image' if len(images) == 1 else 'images'}.")
|
|
|
|
|
99 |
progress(0.1, desc="HTRflow: Processing")
|
100 |
|
101 |
collection.label = "demo_output"
|
|
|
159 |
return [f"https://lbiiif.riksarkivet.se/arkis!{image_id}/full/max/0/default.jpg"]
|
160 |
|
161 |
|
162 |
+
# def get_images_from_iiif_manifest(iiif_manifest_url):
|
163 |
+
# """
|
164 |
+
# Read all images from a v2/v3 IIIF manifest.
|
165 |
+
|
166 |
+
# Arguments:
|
167 |
+
# manifest: IIIF manifest
|
168 |
+
# height: Max height of returned images.
|
169 |
+
# """
|
170 |
+
# try:
|
171 |
+
# response = requests.get(iiif_manifest_url, timeout=5)
|
172 |
+
# response.raise_for_status()
|
173 |
+
# except (requests.HTTPError, requests.ConnectionError) as e:
|
174 |
+
# gr.Error(f"Could not fetch IIIF manifest from {iiif_manifest_url} ({e})")
|
175 |
+
# return
|
176 |
+
|
177 |
+
# # Hacky solution to get all images regardless of API version - treat
|
178 |
+
# # the manifest as a string and match everything that looks like an IIIF
|
179 |
+
# # image URL.
|
180 |
+
# manifest = response.text
|
181 |
+
# pattern = r'(?P<identifier>https?://[^"\s]*)/(?P<region>[^"\s]*?)/(?P<size>[^"\s]*?)/(?P<rotation>!?\d*?)/(?P<quality>[^"\s]*?)\.(?P<format>jpg|tif|png|gif|jp2|pdf|webp)'
|
182 |
+
# height= 1200
|
183 |
+
|
184 |
+
# images = set() # create a set to eliminate duplicates (e.g. thumbnails and fullsize images)
|
185 |
+
# for match in re.findall(pattern, manifest):
|
186 |
+
# identifier, _, _, _, _, format_ = match
|
187 |
+
# images.add(f"{identifier}/full/{height},/0/default.{format_}")
|
188 |
|
189 |
+
# return sorted(images)
|
190 |
+
|
191 |
+
|
192 |
+
def get_images_from_iiif_manifest(iiif_manifest_url, max_images=20, height=1200):
|
193 |
+
"""
|
194 |
+
Read images from a v2/v3 IIIF manifest, limited to max_images.
|
195 |
+
|
196 |
Arguments:
|
197 |
+
iiif_manifest_url: URL to IIIF manifest
|
198 |
+
height: Max height of returned images
|
199 |
+
max_images: Maximum number of images to return (default: 20)
|
200 |
"""
|
201 |
try:
|
202 |
+
buffer = io.BytesIO()
|
203 |
+
c = pycurl.Curl()
|
204 |
+
|
205 |
+
c.setopt(c.URL, iiif_manifest_url)
|
206 |
+
c.setopt(c.WRITEDATA, buffer)
|
207 |
+
c.setopt(c.CAINFO, certifi.where())
|
208 |
+
c.setopt(c.FOLLOWLOCATION, 1)
|
209 |
+
c.setopt(c.MAXREDIRS, 5)
|
210 |
+
c.setopt(c.CONNECTTIMEOUT, 5)
|
211 |
+
c.setopt(c.TIMEOUT, 10)
|
212 |
+
c.setopt(c.NOSIGNAL, 1)
|
213 |
+
c.setopt(c.USERAGENT, "curl/7.68.0")
|
214 |
+
|
215 |
+
c.perform()
|
216 |
+
|
217 |
+
http_code = c.getinfo(c.RESPONSE_CODE)
|
218 |
+
if http_code != 200:
|
219 |
+
raise Exception(f"HTTP Error: {http_code}")
|
220 |
+
|
221 |
+
manifest = buffer.getvalue().decode("utf-8")
|
222 |
+
c.close()
|
223 |
+
|
224 |
+
except pycurl.error as e:
|
225 |
+
error_code, error_msg = e.args
|
226 |
+
raise Exception(f"Could not fetch IIIF manifest from {iiif_manifest_url} ({error_msg})")
|
227 |
+
|
228 |
# Hacky solution to get all images regardless of API version - treat
|
229 |
# the manifest as a string and match everything that looks like an IIIF
|
230 |
# image URL.
|
|
|
231 |
pattern = r'(?P<identifier>https?://[^"\s]*)/(?P<region>[^"\s]*?)/(?P<size>[^"\s]*?)/(?P<rotation>!?\d*?)/(?P<quality>[^"\s]*?)\.(?P<format>jpg|tif|png|gif|jp2|pdf|webp)'
|
232 |
+
|
|
|
233 |
images = set() # create a set to eliminate duplicates (e.g. thumbnails and fullsize images)
|
234 |
+
|
235 |
for match in re.findall(pattern, manifest):
|
236 |
identifier, _, _, _, _, format_ = match
|
237 |
images.add(f"{identifier}/full/{height},/0/default.{format_}")
|
238 |
+
|
239 |
+
# Stop adding images if we've reached the maximum
|
240 |
+
if len(images) >= max_images:
|
241 |
+
break
|
242 |
+
|
243 |
+
# Sort and limit the results to max_images
|
244 |
+
return sorted(images)[:max_images], gr.update(visible=True)
|
245 |
|
246 |
|
247 |
with gr.Blocks() as submit:
|
|
|
248 |
gr.Markdown("# Upload")
|
249 |
gr.Markdown("Select or upload the image you want to transcribe. You can upload up to five images at a time.")
|
250 |
|
251 |
collection_submit_state = gr.State()
|
252 |
|
|
|
253 |
with gr.Row(equal_height=True):
|
254 |
with gr.Column(scale=2):
|
255 |
batch_image_gallery = gr.Gallery(
|
|
|
286 |
)
|
287 |
|
288 |
with gr.Tab("IIIF Manifest"):
|
289 |
+
with gr.Group():
|
290 |
+
iiif_manifest_url = gr.Textbox(
|
291 |
+
label="IIIF Manifest",
|
292 |
+
info=(
|
293 |
+
"Use an image from a IIIF manifest by pasting a IIIF manifest URL. Press enter to submit."
|
294 |
+
),
|
295 |
+
placeholder="",
|
296 |
+
scale=0
|
297 |
+
)
|
298 |
+
max_images_iiif_manifest= gr.Number(value=20, min_width=50, scale=0,
|
299 |
+
label="Number of image to return from IIIF manifest",
|
300 |
+
minimum=1, visible=False)
|
301 |
+
iiif_gallery = gr.Gallery(
|
302 |
+
interactive=False,
|
303 |
+
columns=4,
|
304 |
+
allow_preview=False,
|
305 |
+
container=False,
|
306 |
+
show_label=False,
|
307 |
+
object_fit="scale-down",
|
308 |
+
)
|
309 |
|
310 |
with gr.Tab("URL"):
|
311 |
+
image_url = gr.Textbox(
|
312 |
+
label="Image URL",
|
313 |
+
info="Upload an image by pasting its URL.",
|
314 |
+
placeholder="https://example.com/image.jpg",
|
315 |
+
)
|
316 |
|
317 |
with gr.Column(variant="panel", elem_classes="panel-with-border"):
|
318 |
gr.Markdown("## Settings")
|
|
|
382 |
return gr.update(value=None)
|
383 |
return images
|
384 |
|
385 |
+
image_id.submit(get_image_from_image_id, image_id, batch_image_gallery).then(
|
386 |
+
fn=lambda: "Swedish - Spreads", outputs=pipeline_dropdown
|
387 |
+
)
|
388 |
+
iiif_manifest_url.submit(get_images_from_iiif_manifest, [iiif_manifest_url, max_images_iiif_manifest], [iiif_gallery, max_images_iiif_manifest])
|
389 |
image_url.submit(lambda url: [url], image_url, batch_image_gallery)
|
390 |
|
|
|
391 |
run_button.click(
|
392 |
fn=run_htrflow,
|
393 |
inputs=[custom_template_yaml, batch_image_gallery],
|
pyproject.toml
CHANGED
@@ -23,6 +23,7 @@ dependencies = [
|
|
23 |
"gradio-modal>=0.0.4",
|
24 |
"dill>=0.3.9",
|
25 |
"spaces>=0.32.0",
|
|
|
26 |
]
|
27 |
|
28 |
[project.urls]
|
|
|
23 |
"gradio-modal>=0.0.4",
|
24 |
"dill>=0.3.9",
|
25 |
"spaces>=0.32.0",
|
26 |
+
"pycurl",
|
27 |
]
|
28 |
|
29 |
[project.urls]
|
requirements.txt
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
htrflow==0.2.5
|
2 |
-
gradio>=5.
|
3 |
tqdm>=4.67.1
|
4 |
gradio-modal>=0.0.4
|
5 |
dill>=0.3.9
|
|
|
|
1 |
htrflow==0.2.5
|
2 |
+
gradio>=5.20.1
|
3 |
tqdm>=4.67.1
|
4 |
gradio-modal>=0.0.4
|
5 |
dill>=0.3.9
|
6 |
+
pycurl>=7.45.6
|