Gabriel committed on
Commit
5e081d3
·
1 Parent(s): 4846384

added pycurl instead of requests

Browse files
README.md CHANGED
@@ -11,5 +11,5 @@ short_description: htrflow demo app
11
  header: mini
12
  thumbnail: >-
13
  https://cdn-uploads.huggingface.co/production/uploads/60a4e677917119d38f6bbff8/-qMf3PaegicobqW5hXyiA.png
14
- sdk_version: 5.17.0
15
  ---
 
11
  header: mini
12
  thumbnail: >-
13
  https://cdn-uploads.huggingface.co/production/uploads/60a4e677917119d38f6bbff8/-qMf3PaegicobqW5hXyiA.png
14
+ sdk_version: 5.20.1
15
  ---
app/content/main_sub_title_hum.md CHANGED
@@ -1,5 +1,5 @@
1
  <a href="https://www.huminfra.se/">
2
  <img src="https://www.umu.se/globalassets/qbank/huminfra-till-webben-48568w600h600.jpg"
3
- width="60%"
4
  style="margin-top: 0.3rem; border-radius: 0.5rem;" />
5
  </a>
 
1
  <a href="https://www.huminfra.se/">
2
  <img src="https://www.umu.se/globalassets/qbank/huminfra-till-webben-48568w600h600.jpg"
3
+ width="50%"
4
  style="margin-top: 0.3rem; border-radius: 0.5rem;" />
5
  </a>
app/content/sidebar.md CHANGED
@@ -1,14 +1,9 @@
1
  ## HTRflow Demo
2
 
3
- Welcome to the **HTRflow Demo** – a web application developed by the **National Archives of Sweden** in collaboration with [Huminfra](https://www.huminfra.se/). This demo lets you explore how AI transforms historical manuscripts into digital text using [HTRflow](https://ai-riksarkivet.github.io/htrflow/latest).
4
-
5
- ### How It Works
6
-
7
- 1. **Upload:** Submit your own image to run through HTRflow.
8
- 2. **Results:** View the transcribed text generated by the system.
9
- 3. **Export:** Choose your preferred format and download your results.
10
 
11
  > Note: This demo application is for demonstration purposes only and is not intended for production use.
 
12
 
13
  ### Contact
14
 
 
1
  ## HTRflow Demo
2
 
3
+ Welcome to the **HTRflow Demo** – a web application developed by the **National Archives of Sweden** in collaboration with [Huminfra](https://www.huminfra.se/). This demo lets you explore how AI transforms historical manuscripts into digital text using [HTRflow](https://ai-riksarkivet.github.io/htrflow/latest).
 
 
 
 
 
 
4
 
5
  > Note: This demo application is for demonstration purposes only and is not intended for production use.
6
+ > The application is hosted on Hugging Face 🤗 using shared infrastructure, which means there is a daily quota on how much you can use the app.
7
 
8
  ### Contact
9
 
app/main.py CHANGED
@@ -2,7 +2,6 @@ import logging
2
  import os
3
 
4
  import gradio as gr
5
- from htrflow.models.huggingface.trocr import TrOCR
6
 
7
  from app.gradio_config import css, theme
8
  from app.tabs.export import collection as collection_export_state
 
2
  import os
3
 
4
  import gradio as gr
 
5
 
6
  from app.gradio_config import css, theme
7
  from app.tabs.export import collection as collection_export_state
app/tabs/submit.py CHANGED
@@ -1,10 +1,12 @@
 
1
  import logging
2
  import os
3
- import time
4
  import re
5
- import requests
6
 
 
7
  import gradio as gr
 
8
  import spaces
9
  import yaml
10
  from gradio_modal import Modal
@@ -34,12 +36,7 @@ class PipelineWithProgress(Pipeline):
34
  @classmethod
35
  def from_config(cls, config: dict[str, str]):
36
  """Init pipeline from config, ensuring the correct subclass is instantiated."""
37
- return cls(
38
- [
39
- init_step(step["step"], step.get("settings", {}))
40
- for step in config["steps"]
41
- ]
42
- )
43
 
44
  def run(self, collection, start=0, progress=None):
45
  """
@@ -67,7 +64,7 @@ class PipelineWithProgress(Pipeline):
67
  return collection
68
 
69
 
70
- @spaces.GPU
71
  def run_htrflow(custom_template_yaml, batch_image_gallery, progress=gr.Progress()):
72
  """
73
  Executes the HTRflow pipeline based on the provided YAML configuration and batch images.
@@ -98,9 +95,7 @@ def run_htrflow(custom_template_yaml, batch_image_gallery, progress=gr.Progress(
98
 
99
  pipe = PipelineWithProgress.from_config(config)
100
 
101
- gr.Info(
102
- f"HTRflow: processing {len(images)} {'image' if len(images) == 1 else 'images'}."
103
- )
104
  progress(0.1, desc="HTRflow: Processing")
105
 
106
  collection.label = "demo_output"
@@ -164,44 +159,97 @@ def get_image_from_image_id(image_id):
164
  return [f"https://lbiiif.riksarkivet.se/arkis!{image_id}/full/max/0/default.jpg"]
165
 
166
 
167
- def get_images_from_iiif_manifest(iiif_manifest_url):
168
- """
169
- Read all images from a v2/v3 IIIF manifest.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
 
 
 
 
 
 
 
 
171
  Arguments:
172
- manifest: IIIF manifest
173
- height: Max height of returned images.
 
174
  """
175
  try:
176
- response = requests.get(iiif_manifest_url, timeout=5)
177
- response.raise_for_status()
178
- except (requests.HTTPError, requests.ConnectionError) as e:
179
- gr.Error(f"Could not fetch IIIF manifest from {iiif_manifest_url} ({e})")
180
- return
181
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  # Hacky solution to get all images regardless of API version - treat
183
  # the manifest as a string and match everything that looks like an IIIF
184
  # image URL.
185
- manifest = response.text
186
  pattern = r'(?P<identifier>https?://[^"\s]*)/(?P<region>[^"\s]*?)/(?P<size>[^"\s]*?)/(?P<rotation>!?\d*?)/(?P<quality>[^"\s]*?)\.(?P<format>jpg|tif|png|gif|jp2|pdf|webp)'
187
- height= 1200
188
-
189
  images = set() # create a set to eliminate duplicates (e.g. thumbnails and fullsize images)
 
190
  for match in re.findall(pattern, manifest):
191
  identifier, _, _, _, _, format_ = match
192
  images.add(f"{identifier}/full/{height},/0/default.{format_}")
193
-
194
- return sorted(images)
 
 
 
 
 
195
 
196
 
197
  with gr.Blocks() as submit:
198
-
199
  gr.Markdown("# Upload")
200
  gr.Markdown("Select or upload the image you want to transcribe. You can upload up to five images at a time.")
201
 
202
  collection_submit_state = gr.State()
203
 
204
-
205
  with gr.Row(equal_height=True):
206
  with gr.Column(scale=2):
207
  batch_image_gallery = gr.Gallery(
@@ -238,26 +286,33 @@ with gr.Blocks() as submit:
238
  )
239
 
240
  with gr.Tab("IIIF Manifest"):
241
- iiif_manifest_url = gr.Textbox(
242
- label="IIIF Manifest",
243
- info=(
244
- "Use an image from a IIIF manifest by pasting a IIIF manifest URL. "
245
- "Press enter to submit."
246
- ),
247
- placeholder="",
248
- )
249
- iiif_gallery = gr.Gallery(
250
- interactive=False,
251
- columns=4,
252
- allow_preview=False,
253
- container=False,
254
- show_label=False,
255
- object_fit="scale-down",
256
- )
 
 
 
 
257
 
258
  with gr.Tab("URL"):
259
- image_url = gr.Textbox(label="Image URL", info="Upload an image by pasting its URL.", placeholder="https://example.com/image.jpg")
260
-
 
 
 
261
 
262
  with gr.Column(variant="panel", elem_classes="panel-with-border"):
263
  gr.Markdown("## Settings")
@@ -327,12 +382,12 @@ with gr.Blocks() as submit:
327
  return gr.update(value=None)
328
  return images
329
 
330
-
331
- image_id.submit(get_image_from_image_id, image_id, batch_image_gallery).then(fn=lambda: "Swedish - Spreads", outputs=pipeline_dropdown)
332
- iiif_manifest_url.submit(get_images_from_iiif_manifest, iiif_manifest_url, iiif_gallery)
 
333
  image_url.submit(lambda url: [url], image_url, batch_image_gallery)
334
 
335
-
336
  run_button.click(
337
  fn=run_htrflow,
338
  inputs=[custom_template_yaml, batch_image_gallery],
 
1
+ import io
2
  import logging
3
  import os
 
4
  import re
5
+ import time
6
 
7
+ import certifi
8
  import gradio as gr
9
+ import pycurl
10
  import spaces
11
  import yaml
12
  from gradio_modal import Modal
 
36
  @classmethod
37
  def from_config(cls, config: dict[str, str]):
38
  """Init pipeline from config, ensuring the correct subclass is instantiated."""
39
+ return cls([init_step(step["step"], step.get("settings", {})) for step in config["steps"]])
 
 
 
 
 
40
 
41
  def run(self, collection, start=0, progress=None):
42
  """
 
64
  return collection
65
 
66
 
67
+ # @spaces.GPU
68
  def run_htrflow(custom_template_yaml, batch_image_gallery, progress=gr.Progress()):
69
  """
70
  Executes the HTRflow pipeline based on the provided YAML configuration and batch images.
 
95
 
96
  pipe = PipelineWithProgress.from_config(config)
97
 
98
+ gr.Info(f"HTRflow: processing {len(images)} {'image' if len(images) == 1 else 'images'}.")
 
 
99
  progress(0.1, desc="HTRflow: Processing")
100
 
101
  collection.label = "demo_output"
 
159
  return [f"https://lbiiif.riksarkivet.se/arkis!{image_id}/full/max/0/default.jpg"]
160
 
161
 
162
+ # def get_images_from_iiif_manifest(iiif_manifest_url):
163
+ # """
164
+ # Read all images from a v2/v3 IIIF manifest.
165
+
166
+ # Arguments:
167
+ # manifest: IIIF manifest
168
+ # height: Max height of returned images.
169
+ # """
170
+ # try:
171
+ # response = requests.get(iiif_manifest_url, timeout=5)
172
+ # response.raise_for_status()
173
+ # except (requests.HTTPError, requests.ConnectionError) as e:
174
+ # gr.Error(f"Could not fetch IIIF manifest from {iiif_manifest_url} ({e})")
175
+ # return
176
+
177
+ # # Hacky solution to get all images regardless of API version - treat
178
+ # # the manifest as a string and match everything that looks like an IIIF
179
+ # # image URL.
180
+ # manifest = response.text
181
+ # pattern = r'(?P<identifier>https?://[^"\s]*)/(?P<region>[^"\s]*?)/(?P<size>[^"\s]*?)/(?P<rotation>!?\d*?)/(?P<quality>[^"\s]*?)\.(?P<format>jpg|tif|png|gif|jp2|pdf|webp)'
182
+ # height= 1200
183
+
184
+ # images = set() # create a set to eliminate duplicates (e.g. thumbnails and fullsize images)
185
+ # for match in re.findall(pattern, manifest):
186
+ # identifier, _, _, _, _, format_ = match
187
+ # images.add(f"{identifier}/full/{height},/0/default.{format_}")
188
 
189
+ # return sorted(images)
190
+
191
+
192
def get_images_from_iiif_manifest(iiif_manifest_url, max_images=20, height=1200):
    """
    Read images from a v2/v3 IIIF manifest, limited to max_images.

    The manifest is downloaded with pycurl and scanned as plain text for
    anything that looks like a IIIF image URL, so both API versions work.

    Arguments:
        iiif_manifest_url: http(s) URL to the IIIF manifest.
        max_images: Maximum number of images to return (default: 20). Values
            that cannot be converted to an integer (e.g. None from an empty
            number field) fall back to 20; negative values are treated as 0.
        height: Max height of returned images.

    Returns:
        A tuple ``(images, update)`` where ``images`` is a sorted list of
        unique image URLs and ``update`` is ``gr.update(visible=True)`` for
        the second output component bound to this callback.

    Raises:
        gr.Error: If the URL is not http(s), the request fails, or the server
            does not answer with HTTP 200.
    """
    url = (iiif_manifest_url or "").strip()
    # libcurl also understands file://, ftp://, ... - only allow web URLs,
    # since this value comes straight from a user-facing textbox.
    if not url.lower().startswith(("http://", "https://")):
        raise gr.Error("The IIIF manifest URL must start with http:// or https://")

    # The UI passes the value of a number field, which may be None or a float.
    try:
        limit = max(0, int(max_images))
    except (TypeError, ValueError):
        limit = 20

    buffer = io.BytesIO()
    c = pycurl.Curl()
    try:
        c.setopt(c.URL, url)
        c.setopt(c.WRITEDATA, buffer)
        c.setopt(c.CAINFO, certifi.where())
        c.setopt(c.FOLLOWLOCATION, 1)
        c.setopt(c.MAXREDIRS, 5)
        c.setopt(c.CONNECTTIMEOUT, 5)
        c.setopt(c.TIMEOUT, 10)
        c.setopt(c.NOSIGNAL, 1)
        c.setopt(c.USERAGENT, "curl/7.68.0")

        c.perform()

        http_code = c.getinfo(c.RESPONSE_CODE)
    except pycurl.error as e:
        # pycurl.error normally carries (code, message); do not assume it.
        error_msg = e.args[1] if len(e.args) > 1 else str(e)
        raise gr.Error(f"Could not fetch IIIF manifest from {url} ({error_msg})") from e
    finally:
        # Always release the curl handle, including on error paths.
        c.close()

    if http_code != 200:
        raise gr.Error(f"HTTP Error: {http_code}")

    # Tolerate stray non-UTF-8 bytes; only URL-like substrings are used below.
    manifest = buffer.getvalue().decode("utf-8", errors="replace")

    # Hacky solution to get all images regardless of API version - treat
    # the manifest as a string and match everything that looks like an IIIF
    # image URL.
    pattern = r'(?P<identifier>https?://[^"\s]*)/(?P<region>[^"\s]*?)/(?P<size>[^"\s]*?)/(?P<rotation>!?\d*?)/(?P<quality>[^"\s]*?)\.(?P<format>jpg|tif|png|gif|jp2|pdf|webp)'

    images = set()  # a set eliminates duplicates (e.g. thumbnails and fullsize images)
    for match in re.finditer(pattern, manifest):
        # Stop as soon as the requested number of unique images is collected.
        if len(images) >= limit:
            break
        images.add(f"{match['identifier']}/full/{height},/0/default.{match['format']}")

    return sorted(images), gr.update(visible=True)
245
 
246
 
247
  with gr.Blocks() as submit:
 
248
  gr.Markdown("# Upload")
249
  gr.Markdown("Select or upload the image you want to transcribe. You can upload up to five images at a time.")
250
 
251
  collection_submit_state = gr.State()
252
 
 
253
  with gr.Row(equal_height=True):
254
  with gr.Column(scale=2):
255
  batch_image_gallery = gr.Gallery(
 
286
  )
287
 
288
  with gr.Tab("IIIF Manifest"):
289
+ with gr.Group():
290
+ iiif_manifest_url = gr.Textbox(
291
+ label="IIIF Manifest",
292
+ info=(
293
+ "Use an image from a IIIF manifest by pasting a IIIF manifest URL. Press enter to submit."
294
+ ),
295
+ placeholder="",
296
+ scale=0
297
+ )
298
+ max_images_iiif_manifest= gr.Number(value=20, min_width=50, scale=0,
299
+ label="Number of image to return from IIIF manifest",
300
+ minimum=1, visible=False)
301
+ iiif_gallery = gr.Gallery(
302
+ interactive=False,
303
+ columns=4,
304
+ allow_preview=False,
305
+ container=False,
306
+ show_label=False,
307
+ object_fit="scale-down",
308
+ )
309
 
310
  with gr.Tab("URL"):
311
+ image_url = gr.Textbox(
312
+ label="Image URL",
313
+ info="Upload an image by pasting its URL.",
314
+ placeholder="https://example.com/image.jpg",
315
+ )
316
 
317
  with gr.Column(variant="panel", elem_classes="panel-with-border"):
318
  gr.Markdown("## Settings")
 
382
  return gr.update(value=None)
383
  return images
384
 
385
+ image_id.submit(get_image_from_image_id, image_id, batch_image_gallery).then(
386
+ fn=lambda: "Swedish - Spreads", outputs=pipeline_dropdown
387
+ )
388
+ iiif_manifest_url.submit(get_images_from_iiif_manifest, [iiif_manifest_url, max_images_iiif_manifest], [iiif_gallery, max_images_iiif_manifest])
389
  image_url.submit(lambda url: [url], image_url, batch_image_gallery)
390
 
 
391
  run_button.click(
392
  fn=run_htrflow,
393
  inputs=[custom_template_yaml, batch_image_gallery],
pyproject.toml CHANGED
@@ -23,6 +23,7 @@ dependencies = [
23
  "gradio-modal>=0.0.4",
24
  "dill>=0.3.9",
25
  "spaces>=0.32.0",
 
26
  ]
27
 
28
  [project.urls]
 
23
  "gradio-modal>=0.0.4",
24
  "dill>=0.3.9",
25
  "spaces>=0.32.0",
26
+ "pycurl",
27
  ]
28
 
29
  [project.urls]
requirements.txt CHANGED
@@ -1,5 +1,6 @@
1
  htrflow==0.2.5
2
- gradio>=5.17.0
3
  tqdm>=4.67.1
4
  gradio-modal>=0.0.4
5
  dill>=0.3.9
 
 
1
  htrflow==0.2.5
2
+ gradio>=5.20.1
3
  tqdm>=4.67.1
4
  gradio-modal>=0.0.4
5
  dill>=0.3.9
6
+ pycurl>=7.45.6