Bentham committed on
Commit
eb2e4f8
·
verified ·
1 Parent(s): 92bfb62

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +46 -17
main.py CHANGED
@@ -348,21 +348,38 @@ def convert_ppt_to_text(input_filename: str) -> str:
348
  for shape in slide.shapes:
349
  if hasattr(shape, "text"):
350
  text_content.append(shape.text)
351
- if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
352
- image_bytes = shape.image.blob
353
- image_base64 = base64.b64encode(image_bytes).decode('utf-8')
354
- image_extension = shape.image.filename.split('.')[-1].lower()
355
- if image_extension in ['png', 'jpeg', 'jpg', 'gif']:
356
- mime_type = f'image/{image_extension}'
357
- else:
358
- mime_type = 'image/png' # Par défaut si l'extension est inconnue
359
- data_uri = f'data:{mime_type};base64,{image_base64}'
360
- # Ajouter une balise img avec le data URI ; inclure un alt si disponible
361
- alt_text = shape.name # Ou une logique différente pour déterminer l'alt text
362
- img_tag = f'<img src="{data_uri}" alt="{alt_text}">'
363
- text_content.append(img_tag)
364
  return "\n".join(text_content)
365
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
366
 
367
  def convert_doc_to_text(input_filename: str) -> str:
368
  if 'textract' not in globals():
@@ -601,12 +618,18 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
601
  # Conversion en HTML sans extraire les images à ce stade
602
  if ext == '.pdf':
603
  html_content = pdf_to_html(input_file_path)
604
- elif ext in ['.ppt', '.pptx']:
 
 
 
605
  text = convert_ppt_to_text(input_file_path)
606
  html_content = text_to_html(text)
 
607
  elif ext == '.doc':
608
  text = convert_doc_to_text(input_file_path)
609
  html_content = text_to_html(text)
 
 
610
  elif ext in ['.html', '.htm']:
611
  with open(input_file_path, 'r', encoding='utf-8') as f:
612
  raw_html = f.read()
@@ -617,9 +640,11 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
617
  except Exception as e:
618
  logging.error(f"Erreur lors du nettoyage HTML : {str(e)}")
619
  html_content = raw_html
 
620
  else:
621
  input_format = get_pandoc_format(ext)
622
  html_content = convert_with_pandoc(input_file_path, input_format)
 
623
 
624
  if ext == '.docx':
625
  logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
@@ -632,6 +657,9 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
632
  logging.debug(f"DEBUG ACCESSIBILITY: HTML après clean_html_content : {cleaned_html}...")
633
  logging.debug(f"DEBUG ACCESSIBILITY: images_data après clean_html_content : {images_data}")
634
 
 
 
 
635
  # Décrire les images
636
  for image_key in images_data:
637
  context = get_context_for_image(cleaned_html, image_key)
@@ -682,7 +710,6 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
682
  except Exception as e:
683
  logging.error(f"Erreur lors du traitement du job {job_id}: {str(e)}")
684
  update_job_status(job_id, 'error', f"Erreur: {str(e)}")
685
-
686
 
687
 
688
  @app.get("/status/{job_id}")
@@ -755,7 +782,9 @@ async def convert_file_to_txt(
755
  # Conversion en HTML via pandoc si nécessaire
756
  if ext == '.pdf':
757
  html_content = pdf_to_html(input_filename)
758
- elif ext in ['.ppt', '.pptx']:
 
 
759
  text = convert_ppt_to_text(input_filename)
760
  html_content = text_to_html(text)
761
  elif ext == '.doc':
@@ -766,7 +795,7 @@ async def convert_file_to_txt(
766
  else:
767
  input_format = get_pandoc_format(ext)
768
  html_content = convert_with_pandoc(input_filename, input_format)
769
-
770
  if ext == '.docx':
771
  logging.debug(f"DEBUG CONVERT_TO_TXT (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
772
 
 
348
  for shape in slide.shapes:
349
  if hasattr(shape, "text"):
350
  text_content.append(shape.text)
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  return "\n".join(text_content)
352
 
353
def convert_pptx_to_html(input_filename: str) -> str:
    """Convert a .pptx presentation into a single HTML document string.

    Each slide is introduced by a ``<!--PAGE_n-->`` comment (n starts at 1).
    Shapes that carry a text frame are emitted as ``<h1>`` when they are the
    placeholder with index 0, and as ``<p>`` otherwise. Shapes that expose an
    embedded image are emitted as ``<img>`` tags with a base64 data URI.
    All other shapes are skipped.

    Args:
        input_filename: Path of the .pptx file to read.

    Returns:
        The generated HTML, wrapped in ``<html><head></head><body>...</body></html>``.

    Raises:
        HTTPException: With status 500 when ``Presentation`` is not defined at
            module level (python-pptx could not be imported).
    """
    if 'Presentation' not in globals():
        raise HTTPException(status_code=500, detail="La librairie python-pptx n'est pas installée.")

    # Local import keeps this block self-contained.
    from html import escape

    prs = Presentation(input_filename)
    parts = ["<html><head></head><body>"]

    for slide_number, slide in enumerate(prs.slides, start=1):
        parts.append(f"<!--PAGE_{slide_number}-->")
        for shape in slide.shapes:
            if shape.has_text_frame:
                # Slide text is arbitrary user content: escape it so that
                # '<', '>' and '&' cannot break or inject markup.
                text_content = escape(shape.text_frame.text, quote=False)
                # Placeholder index 0 is treated as the slide title; every
                # other text shape becomes a plain paragraph.
                is_title = shape.is_placeholder and shape.placeholder_format.idx == 0
                tag = "h1" if is_title else "p"
                parts.append(f"<{tag}>{text_content}</{tag}>")
                continue

            # Shapes have no `has_image` attribute; probe `.image` directly.
            # Non-picture shapes raise AttributeError, and pictures without an
            # embedded image are expected to raise ValueError. Both are skipped.
            try:
                image = shape.image
            except (AttributeError, ValueError):
                continue

            base64_encoded = base64.b64encode(image.blob).decode('utf-8')
            mime_type = image.content_type
            parts.append(
                f'<img src="data:{mime_type};base64,{base64_encoded}" alt="Slide Image">'
            )

    parts.append("</body></html>")
    return "".join(parts)
383
 
384
  def convert_doc_to_text(input_filename: str) -> str:
385
  if 'textract' not in globals():
 
618
  # Conversion en HTML sans extraire les images à ce stade
619
  if ext == '.pdf':
620
  html_content = pdf_to_html(input_file_path)
621
+ elif ext == '.pptx':
622
+ html_content = convert_pptx_to_html(input_file_path)
623
+ html_content = insert_page_comments_every_15_paragraphs(html_content)
624
+ elif ext == '.ppt':
625
  text = convert_ppt_to_text(input_file_path)
626
  html_content = text_to_html(text)
627
+ html_content = insert_page_comments_every_15_paragraphs(html_content)
628
  elif ext == '.doc':
629
  text = convert_doc_to_text(input_file_path)
630
  html_content = text_to_html(text)
631
+ html_content = insert_page_comments_every_15_paragraphs(html_content)
632
+ logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
633
  elif ext in ['.html', '.htm']:
634
  with open(input_file_path, 'r', encoding='utf-8') as f:
635
  raw_html = f.read()
 
640
  except Exception as e:
641
  logging.error(f"Erreur lors du nettoyage HTML : {str(e)}")
642
  html_content = raw_html
643
+ html_content = insert_page_comments_every_15_paragraphs(html_content)
644
  else:
645
  input_format = get_pandoc_format(ext)
646
  html_content = convert_with_pandoc(input_file_path, input_format)
647
+ html_content = insert_page_comments_every_15_paragraphs(html_content)
648
 
649
  if ext == '.docx':
650
  logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
 
657
  logging.debug(f"DEBUG ACCESSIBILITY: HTML après clean_html_content : {cleaned_html}...")
658
  logging.debug(f"DEBUG ACCESSIBILITY: images_data après clean_html_content : {images_data}")
659
 
660
+ if ext == '.docx':
661
+ logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après clean_html_content (complet) : {cleaned_html}") # Afficher le HTML complet
662
+
663
  # Décrire les images
664
  for image_key in images_data:
665
  context = get_context_for_image(cleaned_html, image_key)
 
710
  except Exception as e:
711
  logging.error(f"Erreur lors du traitement du job {job_id}: {str(e)}")
712
  update_job_status(job_id, 'error', f"Erreur: {str(e)}")
 
713
 
714
 
715
  @app.get("/status/{job_id}")
 
782
  # Conversion en HTML via pandoc si nécessaire
783
  if ext == '.pdf':
784
  html_content = pdf_to_html(input_filename)
785
+ elif ext == '.pptx':
786
+ html_content = convert_pptx_to_html(input_filename)
787
+ elif ext in ['.ppt']:
788
  text = convert_ppt_to_text(input_filename)
789
  html_content = text_to_html(text)
790
  elif ext == '.doc':
 
795
  else:
796
  input_format = get_pandoc_format(ext)
797
  html_content = convert_with_pandoc(input_filename, input_format)
798
+
799
  if ext == '.docx':
800
  logging.debug(f"DEBUG CONVERT_TO_TXT (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
801