Bentham committed on
Commit
f3ad28d
·
verified ·
1 Parent(s): 352dced

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +20 -0
main.py CHANGED
@@ -348,6 +348,20 @@ def convert_ppt_to_text(input_filename: str) -> str:
348
  for shape in slide.shapes:
349
  if hasattr(shape, "text"):
350
  text_content.append(shape.text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  return "\n".join(text_content)
352
 
353
  def convert_doc_to_text(input_filename: str) -> str:
@@ -596,6 +610,9 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
596
  html_content = text_to_html(text)
597
  html_content = insert_page_comments_every_15_paragraphs(html_content)
598
  logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
 
 
 
599
  elif ext in ['.html', '.htm']:
600
  with open(input_file_path, 'r', encoding='utf-8') as f:
601
  raw_html = f.read()
@@ -626,6 +643,9 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
626
  if ext == '.docx':
627
  logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après clean_html_content (complet) : {cleaned_html}") # Afficher le HTML complet
628
 
 
 
 
629
  # Décrire les images
630
  for image_key in images_data:
631
  context = get_context_for_image(cleaned_html, image_key)
 
348
  for shape in slide.shapes:
349
  if hasattr(shape, "text"):
350
  text_content.append(shape.text)
351
+ if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
352
+ image_bytes = shape.image.blob
353
+ image_base64 = base64.b64encode(image_bytes).decode('utf-8')
354
+ image_extension = shape.image.filename.split('.')[-1].lower()
355
+ if image_extension in ['png', 'jpeg', 'jpg', 'gif']:
356
+ mime_type = f'image/{image_extension}'
357
+ else:
358
+ mime_type = 'image/png' # Default if extension is unknown
359
+ data_uri = f'data:{mime_type};base64,{image_base64}'
360
+ # Add an img tag with the data URI; consider adding alt text if available
361
+ alt_text = shape.name # Or some other logic to determine alt text
362
+ img_tag = f'<img src="{data_uri}" alt="{alt_text}">'
363
+ text_content.append(img_tag)
364
+ text_content.append(shape.text)
365
  return "\n".join(text_content)
366
 
367
  def convert_doc_to_text(input_filename: str) -> str:
 
610
  html_content = text_to_html(text)
611
  html_content = insert_page_comments_every_15_paragraphs(html_content)
612
  logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
613
+ elif ext in ['.html', '.htm']:
614
+ with open(input_file_path, 'r', encoding='utf-8') as f:
615
+ raw_html = f.read()
616
  elif ext in ['.html', '.htm']:
617
  with open(input_file_path, 'r', encoding='utf-8') as f:
618
  raw_html = f.read()
 
643
  if ext == '.docx':
644
  logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après clean_html_content (complet) : {cleaned_html}") # Afficher le HTML complet
645
 
646
+ image_counter = [1]
647
+ images_data = {}
648
+
649
  # Décrire les images
650
  for image_key in images_data:
651
  context = get_context_for_image(cleaned_html, image_key)