Bentham committed on
Commit
ca7514d
·
verified ·
1 Parent(s): 68f361a

extract text with image markers json

Browse files
Files changed (1) hide show
  1. main.py +14 -7
main.py CHANGED
@@ -767,7 +767,13 @@ def extract_text_with_image_markers(input_filename: str) -> Tuple[str, List[Tupl
767
  with fitz.open(input_filename) as doc:
768
  for page_num, page in enumerate(doc, start=1):
769
  page_json = page.get_text("json")
770
- blocks = page_json["blocks"]
 
 
 
 
 
 
771
  for block in blocks:
772
  if block['type'] == 0: # Texte
773
  for line in block.get('lines', []):
@@ -780,12 +786,13 @@ def extract_text_with_image_markers(input_filename: str) -> Tuple[str, List[Tupl
780
  text += marker
781
  # Extraire l'image
782
  xref = block.get('xref')
783
- try:
784
- base_image = doc.extract_image(xref)
785
- image_bytes = base_image["image"]
786
- images.append((img_num, image_bytes))
787
- except Exception as e:
788
- logging.error(f"Erreur lors de l'extraction de l'image xref={xref} : {str(e)}")
 
789
  return text, images
790
 
791
 
 
767
  with fitz.open(input_filename) as doc:
768
  for page_num, page in enumerate(doc, start=1):
769
  page_json = page.get_text("json")
770
+ try:
771
+ page_data = json.loads(page_json) # Parse le JSON en dict
772
+ blocks = page_data["blocks"]
773
+ except json.JSONDecodeError as e:
774
+ logging.error(f"Erreur de décodage JSON sur la page {page_num}: {str(e)}")
775
+ continue # Passe à la page suivante en cas d'erreur
776
+
777
  for block in blocks:
778
  if block['type'] == 0: # Texte
779
  for line in block.get('lines', []):
 
786
  text += marker
787
  # Extraire l'image
788
  xref = block.get('xref')
789
+ if xref is not None:
790
+ try:
791
+ base_image = doc.extract_image(xref)
792
+ image_bytes = base_image["image"]
793
+ images.append((img_num, image_bytes))
794
+ except Exception as e:
795
+ logging.error(f"Erreur lors de l'extraction de l'image xref={xref} sur la page {page_num} : {str(e)}")
796
  return text, images
797
 
798