Spaces:
Sleeping
Sleeping
extract text with image markers json
Browse files
main.py
CHANGED
@@ -767,7 +767,13 @@ def extract_text_with_image_markers(input_filename: str) -> Tuple[str, List[Tupl
|
|
767 |
with fitz.open(input_filename) as doc:
|
768 |
for page_num, page in enumerate(doc, start=1):
|
769 |
page_json = page.get_text("json")
|
770 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
771 |
for block in blocks:
|
772 |
if block['type'] == 0: # Texte
|
773 |
for line in block.get('lines', []):
|
@@ -780,12 +786,13 @@ def extract_text_with_image_markers(input_filename: str) -> Tuple[str, List[Tupl
|
|
780 |
text += marker
|
781 |
# Extraire l'image
|
782 |
xref = block.get('xref')
|
783 |
-
|
784 |
-
|
785 |
-
|
786 |
-
|
787 |
-
|
788 |
-
|
|
|
789 |
return text, images
|
790 |
|
791 |
|
|
|
767 |
with fitz.open(input_filename) as doc:
|
768 |
for page_num, page in enumerate(doc, start=1):
|
769 |
page_json = page.get_text("json")
|
770 |
+
try:
|
771 |
+
page_data = json.loads(page_json) # Parse le JSON en dict
|
772 |
+
blocks = page_data["blocks"]
|
773 |
+
except json.JSONDecodeError as e:
|
774 |
+
logging.error(f"Erreur de décodage JSON sur la page {page_num}: {str(e)}")
|
775 |
+
continue # Passe à la page suivante en cas d'erreur
|
776 |
+
|
777 |
for block in blocks:
|
778 |
if block['type'] == 0: # Texte
|
779 |
for line in block.get('lines', []):
|
|
|
786 |
text += marker
|
787 |
# Extraire l'image
|
788 |
xref = block.get('xref')
|
789 |
+
if xref is not None:
|
790 |
+
try:
|
791 |
+
base_image = doc.extract_image(xref)
|
792 |
+
image_bytes = base_image["image"]
|
793 |
+
images.append((img_num, image_bytes))
|
794 |
+
except Exception as e:
|
795 |
+
logging.error(f"Erreur lors de l'extraction de l'image xref={xref} sur la page {page_num} : {str(e)}")
|
796 |
return text, images
|
797 |
|
798 |
|