Spaces:
Sleeping
Sleeping
extract text with markers
Browse files
main.py
CHANGED
@@ -753,6 +753,42 @@ def extract_images_from_pdf(input_filename: str) -> List[bytes]:
|
|
753 |
logging.info(f"Extraction des images terminée. Nombre total d'images extraites : {len(images)}")
|
754 |
return images
|
755 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
756 |
|
757 |
def extract_images_from_ppt(input_filename: str) -> List[bytes]:
|
758 |
images = []
|
|
|
753 |
logging.info(f"Extraction des images terminée. Nombre total d'images extraites : {len(images)}")
|
754 |
return images
|
755 |
|
756 |
+
def extract_text_with_image_markers(input_filename: str) -> Tuple[str, List[Tuple[int, bytes]]]:
    """
    Extract the text of a PDF, inserting a marker at each image position.

    Pages and blocks are visited in the order PyMuPDF reports them. The text
    of every span is concatenated as-is (no separator is added between spans,
    lines, blocks or pages). For each image block whose bytes could be
    obtained, a marker of the form ``[IMG_<n>]`` is appended to the text and
    ``(n, image_bytes)`` is appended to the image list, so every marker in
    the text has exactly one matching entry in the list and numbers are
    never reused.

    Args:
        input_filename (str): Path to the PDF file.

    Returns:
        Tuple[str, List[Tuple[int, bytes]]]: The extracted text with markers,
        and the list of ``(marker_number, image_bytes)`` pairs, numbered
        from 1 in order of appearance.
    """
    parts: List[str] = []
    images: List[Tuple[int, bytes]] = []

    with fitz.open(input_filename) as doc:
        for page in doc:
            # "dict" yields a Python dict; the "json" variant yields a JSON
            # *string*, which cannot be indexed with ["blocks"].
            page_dict = page.get_text("dict")

            for block in page_dict.get("blocks", []):
                block_type = block.get("type")

                if block_type == 0:  # Text block
                    parts.extend(
                        span.get("text", "")
                        for line in block.get("lines", [])
                        for span in line.get("spans", [])
                    )

                elif block_type == 1:  # Image block
                    # Image blocks of the "dict" output carry their content
                    # under "image"; the xref lookup is kept only as a
                    # fallback in case a block exposes an xref instead.
                    xref = block.get("xref")
                    try:
                        image_bytes = block.get("image") or doc.extract_image(xref)["image"]
                    except Exception as e:
                        logging.error(f"Erreur lors de l'extraction de l'image xref={xref} : {str(e)}")
                        # No marker is emitted for an image we cannot return,
                        # otherwise the text would reference a missing image.
                        continue

                    img_num = len(images) + 1
                    parts.append(f"[IMG_{img_num}]")
                    images.append((img_num, image_bytes))

    return "".join(parts), images
|
791 |
+
|
792 |
|
793 |
def extract_images_from_ppt(input_filename: str) -> List[bytes]:
|
794 |
images = []
|