convert_test

Sleeping

App Files Files Community

Bentham committed on Dec 20, 2024

Commit

eb2e4f8

verified ·

1 Parent(s): 92bfb62

Update main.py

Browse files

Files changed (1) hide show

main.py +46 -17

main.py CHANGED Viewed

@@ -348,21 +348,38 @@ def convert_ppt_to_text(input_filename: str) -> str:
         for shape in slide.shapes:
             if hasattr(shape, "text"):
                 text_content.append(shape.text)
-            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
-                image_bytes = shape.image.blob
-                image_base64 = base64.b64encode(image_bytes).decode('utf-8')
-                image_extension = shape.image.filename.split('.')[-1].lower()
-                if image_extension in ['png', 'jpeg', 'jpg', 'gif']:
-                    mime_type = f'image/{image_extension}'
-                else:
-                    mime_type = 'image/png'  # Par défaut si l'extension est inconnue
-                data_uri = f'data:{mime_type};base64,{image_base64}'
-                # Ajouter une balise img avec le data URI ; inclure un alt si disponible
-                alt_text = shape.name  # Ou une logique différente pour déterminer l'alt text
-                img_tag = f'<img src="{data_uri}" alt="{alt_text}">'
-                text_content.append(img_tag)
     return "\n".join(text_content)
 def convert_doc_to_text(input_filename: str) -> str:
     if 'textract' not in globals():
@@ -601,12 +618,18 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
         # Conversion en HTML sans extraire les images à ce stade
         if ext == '.pdf':
             html_content = pdf_to_html(input_file_path)
-        elif ext in ['.ppt', '.pptx']:
             text = convert_ppt_to_text(input_file_path)
             html_content = text_to_html(text)
         elif ext == '.doc':
             text = convert_doc_to_text(input_file_path)
             html_content = text_to_html(text)
         elif ext in ['.html', '.htm']:
             with open(input_file_path, 'r', encoding='utf-8') as f:
                 raw_html = f.read()
@@ -617,9 +640,11 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
             except Exception as e:
                 logging.error(f"Erreur lors du nettoyage HTML : {str(e)}")
                 html_content = raw_html
         else:
             input_format = get_pandoc_format(ext)
             html_content = convert_with_pandoc(input_file_path, input_format)
         if ext == '.docx':
             logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
@@ -632,6 +657,9 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
         logging.debug(f"DEBUG ACCESSIBILITY: HTML après clean_html_content : {cleaned_html}...")
         logging.debug(f"DEBUG ACCESSIBILITY: images_data après clean_html_content : {images_data}")
         # Décrire les images
         for image_key in images_data:
             context = get_context_for_image(cleaned_html, image_key)
@@ -682,7 +710,6 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
     except Exception as e:
         logging.error(f"Erreur lors du traitement du job {job_id}: {str(e)}")
         update_job_status(job_id, 'error', f"Erreur: {str(e)}")
 @app.get("/status/{job_id}")
@@ -755,7 +782,9 @@ async def convert_file_to_txt(
         # Conversion en HTML via pandoc si nécessaire
         if ext == '.pdf':
             html_content = pdf_to_html(input_filename)
-        elif ext in ['.ppt', '.pptx']:
             text = convert_ppt_to_text(input_filename)
             html_content = text_to_html(text)
         elif ext == '.doc':
@@ -766,7 +795,7 @@ async def convert_file_to_txt(
         else:
             input_format = get_pandoc_format(ext)
             html_content = convert_with_pandoc(input_filename, input_format)
         if ext == '.docx':
             logging.debug(f"DEBUG CONVERT_TO_TXT (.docx): HTML après conversion Pandoc : {html_content[:500]}...")

         for shape in slide.shapes:
             if hasattr(shape, "text"):
                 text_content.append(shape.text)
     return "\n".join(text_content)
+def convert_pptx_to_html(input_filename: str) -> str:
+    """Convert a .pptx presentation into a single HTML document string.
+
+    Each slide is introduced by a ``<!--PAGE_n-->`` comment (n starts at 1).
+    A shape with a text frame becomes ``<h1>`` when it is the placeholder
+    with idx 0 (the title), otherwise ``<p>``. A shape exposing an embedded
+    image becomes an ``<img>`` tag whose source is a base64 data URI.
+    Other shapes are ignored.
+
+    Args:
+        input_filename: Path of the .pptx file to read.
+
+    Returns:
+        The generated HTML document.
+
+    Raises:
+        HTTPException: status 500 when ``Presentation`` (python-pptx) is not
+            available in the module globals.
+    """
+    import html  # function-scope import: only needed to escape slide text
+
+    if 'Presentation' not in globals():
+        raise HTTPException(status_code=500, detail="La librairie python-pptx n'est pas installée.")
+    prs = Presentation(input_filename)
+    # Collect fragments and join once instead of repeated string concatenation.
+    parts = ["<html><head></head><body>"]
+    for slide_number, slide in enumerate(prs.slides, start=1):
+        parts.append(f"<!--PAGE_{slide_number}-->")
+        for shape in slide.shapes:
+            if shape.has_text_frame:
+                # Escape so '<', '>' and '&' in slide text cannot break the markup.
+                text_content = html.escape(shape.text_frame.text)
+                is_title = shape.is_placeholder and shape.placeholder_format.idx == 0
+                tag = "h1" if is_title else "p"
+                parts.append(f"<{tag}>{text_content}</{tag}>")
+                continue
+            # python-pptx shapes have no `has_image` attribute; only picture
+            # shapes expose `.image` (AttributeError otherwise), and a picture
+            # without an embedded image raises ValueError.
+            try:
+                image = shape.image
+            except (AttributeError, ValueError):
+                continue
+            base64_encoded = base64.b64encode(image.blob).decode('utf-8')
+            mime_type = image.content_type
+            parts.append(f'<img src="data:{mime_type};base64,{base64_encoded}" alt="Slide Image">')
+    parts.append("</body></html>")
+    return "".join(parts)
 def convert_doc_to_text(input_filename: str) -> str:
     if 'textract' not in globals():
         # Conversion en HTML sans extraire les images à ce stade
         if ext == '.pdf':
             html_content = pdf_to_html(input_file_path)
+        elif ext == '.pptx':
+            html_content = convert_pptx_to_html(input_file_path)
+            html_content = insert_page_comments_every_15_paragraphs(html_content)
+        elif ext == '.ppt':
             text = convert_ppt_to_text(input_file_path)
             html_content = text_to_html(text)
+            html_content = insert_page_comments_every_15_paragraphs(html_content)
         elif ext == '.doc':
             text = convert_doc_to_text(input_file_path)
             html_content = text_to_html(text)
+            html_content = insert_page_comments_every_15_paragraphs(html_content)
+            logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
         elif ext in ['.html', '.htm']:
             with open(input_file_path, 'r', encoding='utf-8') as f:
                 raw_html = f.read()
             except Exception as e:
                 logging.error(f"Erreur lors du nettoyage HTML : {str(e)}")
                 html_content = raw_html
+            html_content = insert_page_comments_every_15_paragraphs(html_content)
         else:
             input_format = get_pandoc_format(ext)
             html_content = convert_with_pandoc(input_file_path, input_format)
+            html_content = insert_page_comments_every_15_paragraphs(html_content)
         if ext == '.docx':
             logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
         logging.debug(f"DEBUG ACCESSIBILITY: HTML après clean_html_content : {cleaned_html}...")
         logging.debug(f"DEBUG ACCESSIBILITY: images_data après clean_html_content : {images_data}")
+        if ext == '.docx':
+            logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après clean_html_content (complet) : {cleaned_html}") # Afficher le HTML complet
         # Décrire les images
         for image_key in images_data:
             context = get_context_for_image(cleaned_html, image_key)
     except Exception as e:
         logging.error(f"Erreur lors du traitement du job {job_id}: {str(e)}")
         update_job_status(job_id, 'error', f"Erreur: {str(e)}")
 @app.get("/status/{job_id}")
         # Conversion en HTML via pandoc si nécessaire
         if ext == '.pdf':
             html_content = pdf_to_html(input_filename)
+        elif ext == '.pptx':
+            html_content = convert_pptx_to_html(input_filename)
+        elif ext in ['.ppt']:
             text = convert_ppt_to_text(input_filename)
             html_content = text_to_html(text)
         elif ext == '.doc':
         else:
             input_format = get_pandoc_format(ext)
             html_content = convert_with_pandoc(input_filename, input_format)
         if ext == '.docx':
             logging.debug(f"DEBUG CONVERT_TO_TXT (.docx): HTML après conversion Pandoc : {html_content[:500]}...")