Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
@@ -348,6 +348,20 @@ def convert_ppt_to_text(input_filename: str) -> str:
|
|
348 |
for shape in slide.shapes:
|
349 |
if hasattr(shape, "text"):
|
350 |
text_content.append(shape.text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
351 |
return "\n".join(text_content)
|
352 |
|
353 |
def convert_doc_to_text(input_filename: str) -> str:
|
@@ -596,6 +610,9 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
|
|
596 |
html_content = text_to_html(text)
|
597 |
html_content = insert_page_comments_every_15_paragraphs(html_content)
|
598 |
logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
|
|
|
|
|
|
|
599 |
elif ext in ['.html', '.htm']:
|
600 |
with open(input_file_path, 'r', encoding='utf-8') as f:
|
601 |
raw_html = f.read()
|
@@ -626,6 +643,9 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
|
|
626 |
if ext == '.docx':
|
627 |
logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après clean_html_content (complet) : {cleaned_html}") # Afficher le HTML complet
|
628 |
|
|
|
|
|
|
|
629 |
# Décrire les images
|
630 |
for image_key in images_data:
|
631 |
context = get_context_for_image(cleaned_html, image_key)
|
|
|
348 |
for shape in slide.shapes:
|
349 |
if hasattr(shape, "text"):
|
350 |
text_content.append(shape.text)
|
351 |
+
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
352 |
+
image_bytes = shape.image.blob
|
353 |
+
image_base64 = base64.b64encode(image_bytes).decode('utf-8')
|
354 |
+
image_extension = shape.image.filename.split('.')[-1].lower()
|
355 |
+
if image_extension in ['png', 'jpeg', 'jpg', 'gif']:
|
356 |
+
mime_type = f'image/{image_extension}'
|
357 |
+
else:
|
358 |
+
mime_type = 'image/png' # Default if extension is unknown
|
359 |
+
data_uri = f'data:{mime_type};base64,{image_base64}'
|
360 |
+
# Add an img tag with the data URI; consider adding alt text if available
|
361 |
+
alt_text = shape.name # Or some other logic to determine alt text
|
362 |
+
img_tag = f'<img src="{data_uri}" alt="{alt_text}">'
|
363 |
+
text_content.append(img_tag)
|
364 |
+
text_content.append(shape.text)
|
365 |
return "\n".join(text_content)
|
366 |
|
367 |
def convert_doc_to_text(input_filename: str) -> str:
|
|
|
610 |
html_content = text_to_html(text)
|
611 |
html_content = insert_page_comments_every_15_paragraphs(html_content)
|
612 |
logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
|
613 |
+
elif ext in ['.html', '.htm']:
|
614 |
+
with open(input_file_path, 'r', encoding='utf-8') as f:
|
615 |
+
raw_html = f.read()
|
616 |
elif ext in ['.html', '.htm']:
|
617 |
with open(input_file_path, 'r', encoding='utf-8') as f:
|
618 |
raw_html = f.read()
|
|
|
643 |
if ext == '.docx':
|
644 |
logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après clean_html_content (complet) : {cleaned_html}") # Afficher le HTML complet
|
645 |
|
646 |
+
image_counter = [1]
|
647 |
+
images_data = {}
|
648 |
+
|
649 |
# Décrire les images
|
650 |
for image_key in images_data:
|
651 |
context = get_context_for_image(cleaned_html, image_key)
|