accessibility

Sleeping

Bentham committed on Dec 19, 2024

Commit

ca7514d

verified ·

1 Parent(s): 68f361a

extract text with image markers json

Files changed (1) hide show

main.py CHANGED Viewed

@@ -767,7 +767,13 @@ def extract_text_with_image_markers(input_filename: str) -> Tuple[str, List[Tupl
     with fitz.open(input_filename) as doc:
         for page_num, page in enumerate(doc, start=1):
             page_json = page.get_text("json")
-            blocks = page_json["blocks"]
             for block in blocks:
                 if block['type'] == 0:  # Texte
                     for line in block.get('lines', []):
@@ -780,12 +786,13 @@ def extract_text_with_image_markers(input_filename: str) -> Tuple[str, List[Tupl
                     text += marker
                     # Extraire l'image
                     xref = block.get('xref')
-                    try:
-                        base_image = doc.extract_image(xref)
-                        image_bytes = base_image["image"]
-                        images.append((img_num, image_bytes))
-                    except Exception as e:
-                        logging.error(f"Erreur lors de l'extraction de l'image xref={xref} : {str(e)}")
     return text, images

     with fitz.open(input_filename) as doc:
         for page_num, page in enumerate(doc, start=1):
             page_json = page.get_text("json")
+            try:
+                page_data = json.loads(page_json)  # Parse le JSON en dict
+                blocks = page_data["blocks"]
+            except json.JSONDecodeError as e:
+                logging.error(f"Erreur de décodage JSON sur la page {page_num}: {str(e)}")
+                continue  # Passe à la page suivante en cas d'erreur
             for block in blocks:
                 if block['type'] == 0:  # Texte
                     for line in block.get('lines', []):
                     text += marker
                     # Extraire l'image
                     xref = block.get('xref')
+                    if xref is not None:
+                        try:
+                            base_image = doc.extract_image(xref)
+                            image_bytes = base_image["image"]
+                            images.append((img_num, image_bytes))
+                        except Exception as e:
+                            logging.error(f"Erreur lors de l'extraction de l'image xref={xref} sur la page {page_num} : {str(e)}")
     return text, images