convert_test

Sleeping

App Files Files Community

Bentham committed on Dec 20, 2024

Commit

f3ad28d

verified ·

1 Parent(s): 352dced

Update main.py

Browse files

Files changed (1) hide show

main.py +20 -0

main.py CHANGED Viewed

@@ -348,6 +348,20 @@ def convert_ppt_to_text(input_filename: str) -> str:
         for shape in slide.shapes:
             if hasattr(shape, "text"):
                 text_content.append(shape.text)
     return "\n".join(text_content)
 def convert_doc_to_text(input_filename: str) -> str:
@@ -596,6 +610,9 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
             html_content = text_to_html(text)
             html_content = insert_page_comments_every_15_paragraphs(html_content)
             logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
         elif ext in ['.html', '.htm']:
             with open(input_file_path, 'r', encoding='utf-8') as f:
                 raw_html = f.read()
@@ -626,6 +643,9 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
         if ext == '.docx':
             logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après clean_html_content (complet) : {cleaned_html}") # Afficher le HTML complet
         # Décrire les images
         for image_key in images_data:
             context = get_context_for_image(cleaned_html, image_key)

         for shape in slide.shapes:
             if hasattr(shape, "text"):
                 text_content.append(shape.text)
+            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
+                image_bytes = shape.image.blob
+                image_base64 = base64.b64encode(image_bytes).decode('utf-8')
+                image_extension = shape.image.filename.split('.')[-1].lower()
+                if image_extension in ['png', 'jpeg', 'jpg', 'gif']:
+                    mime_type = f'image/{image_extension}'
+                else:
+                    mime_type = 'image/png'  # Default if extension is unknown
+                data_uri = f'data:{mime_type};base64,{image_base64}'
+                # Add an img tag with the data URI; consider adding alt text if available
+                alt_text = shape.name  # Or some other logic to determine alt text
+                img_tag = f'<img src="{data_uri}" alt="{alt_text}">'
+                text_content.append(img_tag)
+                text_content.append(shape.text)
     return "\n".join(text_content)
 def convert_doc_to_text(input_filename: str) -> str:
             html_content = text_to_html(text)
             html_content = insert_page_comments_every_15_paragraphs(html_content)
             logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
+    elif ext in ['.html', '.htm']:
+        with open(input_file_path, 'r', encoding='utf-8') as f:
+            raw_html = f.read()
         elif ext in ['.html', '.htm']:
             with open(input_file_path, 'r', encoding='utf-8') as f:
                 raw_html = f.read()
         if ext == '.docx':
             logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après clean_html_content (complet) : {cleaned_html}") # Afficher le HTML complet
+        image_counter = [1]
+        images_data = {}
         # Décrire les images
         for image_key in images_data:
             context = get_context_for_image(cleaned_html, image_key)