Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
@@ -348,21 +348,38 @@ def convert_ppt_to_text(input_filename: str) -> str:
|
|
348 |
for shape in slide.shapes:
|
349 |
if hasattr(shape, "text"):
|
350 |
text_content.append(shape.text)
|
351 |
-
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
352 |
-
image_bytes = shape.image.blob
|
353 |
-
image_base64 = base64.b64encode(image_bytes).decode('utf-8')
|
354 |
-
image_extension = shape.image.filename.split('.')[-1].lower()
|
355 |
-
if image_extension in ['png', 'jpeg', 'jpg', 'gif']:
|
356 |
-
mime_type = f'image/{image_extension}'
|
357 |
-
else:
|
358 |
-
mime_type = 'image/png' # Par défaut si l'extension est inconnue
|
359 |
-
data_uri = f'data:{mime_type};base64,{image_base64}'
|
360 |
-
# Ajouter une balise img avec le data URI ; inclure un alt si disponible
|
361 |
-
alt_text = shape.name # Ou une logique différente pour déterminer l'alt text
|
362 |
-
img_tag = f'<img src="{data_uri}" alt="{alt_text}">'
|
363 |
-
text_content.append(img_tag)
|
364 |
return "\n".join(text_content)
|
365 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
366 |
|
367 |
def convert_doc_to_text(input_filename: str) -> str:
|
368 |
if 'textract' not in globals():
|
@@ -601,12 +618,18 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
|
|
601 |
# Conversion en HTML sans extraire les images à ce stade
|
602 |
if ext == '.pdf':
|
603 |
html_content = pdf_to_html(input_file_path)
|
604 |
-
elif ext
|
|
|
|
|
|
|
605 |
text = convert_ppt_to_text(input_file_path)
|
606 |
html_content = text_to_html(text)
|
|
|
607 |
elif ext == '.doc':
|
608 |
text = convert_doc_to_text(input_file_path)
|
609 |
html_content = text_to_html(text)
|
|
|
|
|
610 |
elif ext in ['.html', '.htm']:
|
611 |
with open(input_file_path, 'r', encoding='utf-8') as f:
|
612 |
raw_html = f.read()
|
@@ -617,9 +640,11 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
|
|
617 |
except Exception as e:
|
618 |
logging.error(f"Erreur lors du nettoyage HTML : {str(e)}")
|
619 |
html_content = raw_html
|
|
|
620 |
else:
|
621 |
input_format = get_pandoc_format(ext)
|
622 |
html_content = convert_with_pandoc(input_file_path, input_format)
|
|
|
623 |
|
624 |
if ext == '.docx':
|
625 |
logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
|
@@ -632,6 +657,9 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
|
|
632 |
logging.debug(f"DEBUG ACCESSIBILITY: HTML après clean_html_content : {cleaned_html}...")
|
633 |
logging.debug(f"DEBUG ACCESSIBILITY: images_data après clean_html_content : {images_data}")
|
634 |
|
|
|
|
|
|
|
635 |
# Décrire les images
|
636 |
for image_key in images_data:
|
637 |
context = get_context_for_image(cleaned_html, image_key)
|
@@ -682,7 +710,6 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
|
|
682 |
except Exception as e:
|
683 |
logging.error(f"Erreur lors du traitement du job {job_id}: {str(e)}")
|
684 |
update_job_status(job_id, 'error', f"Erreur: {str(e)}")
|
685 |
-
|
686 |
|
687 |
|
688 |
@app.get("/status/{job_id}")
|
@@ -755,7 +782,9 @@ async def convert_file_to_txt(
|
|
755 |
# Conversion en HTML via pandoc si nécessaire
|
756 |
if ext == '.pdf':
|
757 |
html_content = pdf_to_html(input_filename)
|
758 |
-
elif ext
|
|
|
|
|
759 |
text = convert_ppt_to_text(input_filename)
|
760 |
html_content = text_to_html(text)
|
761 |
elif ext == '.doc':
|
@@ -766,7 +795,7 @@ async def convert_file_to_txt(
|
|
766 |
else:
|
767 |
input_format = get_pandoc_format(ext)
|
768 |
html_content = convert_with_pandoc(input_filename, input_format)
|
769 |
-
|
770 |
if ext == '.docx':
|
771 |
logging.debug(f"DEBUG CONVERT_TO_TXT (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
|
772 |
|
|
|
348 |
for shape in slide.shapes:
|
349 |
if hasattr(shape, "text"):
|
350 |
text_content.append(shape.text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
351 |
return "\n".join(text_content)
|
352 |
|
353 |
+
def _pptx_shape_to_html(shape) -> str:
    """Render a single slide shape as an HTML fragment.

    Shapes with a text frame become ``<h1>`` (title placeholder, idx 0) or
    ``<p>`` (anything else). Shapes carrying an embedded image become an
    ``<img>`` tag with a base64 data URI. Every other shape yields ``""``.
    """
    from html import escape  # function-scope import keeps this block self-contained

    if shape.has_text_frame:
        # Escape the text so a literal "<", ">" or "&" typed on a slide
        # cannot corrupt the generated markup.
        text_content = escape(shape.text_frame.text, quote=False)
        # Only evaluate placeholder_format for real placeholders.
        is_title = shape.is_placeholder and shape.placeholder_format.idx == 0
        tag = "h1" if is_title else "p"
        return f"<{tag}>{text_content}</{tag}>"

    # Shapes have no ``has_image`` attribute; only picture shapes expose
    # ``.image``. AttributeError covers shapes without that attribute,
    # ValueError covers a picture with no embedded image.
    try:
        image = shape.image
    except (AttributeError, ValueError):
        return ""

    base64_encoded = base64.b64encode(image.blob).decode('utf-8')
    mime_type = image.content_type
    return f'<img src="data:{mime_type};base64,{base64_encoded}" alt="Slide Image">'


def convert_pptx_to_html(input_filename: str) -> str:
    """Convert a .pptx presentation into a single HTML document string.

    Each slide is introduced by a ``<!--PAGE_n-->`` comment (n starts at 1),
    followed by the HTML fragments of its shapes in slide order.

    Args:
        input_filename: Path of the .pptx file to open.

    Returns:
        The full ``<html>...</html>`` document as one string.

    Raises:
        HTTPException: status 500 when ``Presentation`` is not available in
            this module's globals.
    """
    if 'Presentation' not in globals():
        raise HTTPException(status_code=500, detail="La librairie python-pptx n'est pas installée.")

    prs = Presentation(input_filename)

    # Collect fragments and join once instead of repeated string "+=".
    parts = ["<html><head></head><body>"]
    for slide_number, slide in enumerate(prs.slides, start=1):
        parts.append(f"<!--PAGE_{slide_number}-->")
        parts.extend(_pptx_shape_to_html(shape) for shape in slide.shapes)
    parts.append("</body></html>")
    return "".join(parts)
|
383 |
|
384 |
def convert_doc_to_text(input_filename: str) -> str:
|
385 |
if 'textract' not in globals():
|
|
|
618 |
# Conversion en HTML sans extraire les images à ce stade
|
619 |
if ext == '.pdf':
|
620 |
html_content = pdf_to_html(input_file_path)
|
621 |
+
elif ext == '.pptx':
|
622 |
+
html_content = convert_pptx_to_html(input_file_path)
|
623 |
+
html_content = insert_page_comments_every_15_paragraphs(html_content)
|
624 |
+
elif ext == '.ppt':
|
625 |
text = convert_ppt_to_text(input_file_path)
|
626 |
html_content = text_to_html(text)
|
627 |
+
html_content = insert_page_comments_every_15_paragraphs(html_content)
|
628 |
elif ext == '.doc':
|
629 |
text = convert_doc_to_text(input_file_path)
|
630 |
html_content = text_to_html(text)
|
631 |
+
html_content = insert_page_comments_every_15_paragraphs(html_content)
|
632 |
+
logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
|
633 |
elif ext in ['.html', '.htm']:
|
634 |
with open(input_file_path, 'r', encoding='utf-8') as f:
|
635 |
raw_html = f.read()
|
|
|
640 |
except Exception as e:
|
641 |
logging.error(f"Erreur lors du nettoyage HTML : {str(e)}")
|
642 |
html_content = raw_html
|
643 |
+
html_content = insert_page_comments_every_15_paragraphs(html_content)
|
644 |
else:
|
645 |
input_format = get_pandoc_format(ext)
|
646 |
html_content = convert_with_pandoc(input_file_path, input_format)
|
647 |
+
html_content = insert_page_comments_every_15_paragraphs(html_content)
|
648 |
|
649 |
if ext == '.docx':
|
650 |
logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
|
|
|
657 |
logging.debug(f"DEBUG ACCESSIBILITY: HTML après clean_html_content : {cleaned_html}...")
|
658 |
logging.debug(f"DEBUG ACCESSIBILITY: images_data après clean_html_content : {images_data}")
|
659 |
|
660 |
+
if ext == '.docx':
|
661 |
+
logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après clean_html_content (complet) : {cleaned_html}") # Afficher le HTML complet
|
662 |
+
|
663 |
# Décrire les images
|
664 |
for image_key in images_data:
|
665 |
context = get_context_for_image(cleaned_html, image_key)
|
|
|
710 |
except Exception as e:
|
711 |
logging.error(f"Erreur lors du traitement du job {job_id}: {str(e)}")
|
712 |
update_job_status(job_id, 'error', f"Erreur: {str(e)}")
|
|
|
713 |
|
714 |
|
715 |
@app.get("/status/{job_id}")
|
|
|
782 |
# Conversion en HTML via pandoc si nécessaire
|
783 |
if ext == '.pdf':
|
784 |
html_content = pdf_to_html(input_filename)
|
785 |
+
elif ext == '.pptx':
|
786 |
+
html_content = convert_pptx_to_html(input_filename)
|
787 |
+
elif ext in ['.ppt']:
|
788 |
text = convert_ppt_to_text(input_filename)
|
789 |
html_content = text_to_html(text)
|
790 |
elif ext == '.doc':
|
|
|
795 |
else:
|
796 |
input_format = get_pandoc_format(ext)
|
797 |
html_content = convert_with_pandoc(input_filename, input_format)
|
798 |
+
|
799 |
if ext == '.docx':
|
800 |
logging.debug(f"DEBUG CONVERT_TO_TXT (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
|
801 |
|