Bentham committed on
Commit
881bf2e
·
verified ·
1 Parent(s): f3ad28d

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +95 -108
main.py CHANGED
@@ -355,15 +355,15 @@ def convert_ppt_to_text(input_filename: str) -> str:
355
  if image_extension in ['png', 'jpeg', 'jpg', 'gif']:
356
  mime_type = f'image/{image_extension}'
357
  else:
358
- mime_type = 'image/png' # Default if extension is unknown
359
  data_uri = f'data:{mime_type};base64,{image_base64}'
360
- # Add an img tag with the data URI; consider adding alt text if available
361
- alt_text = shape.name # Or some other logic to determine alt text
362
  img_tag = f'<img src="{data_uri}" alt="{alt_text}">'
363
  text_content.append(img_tag)
364
- text_content.append(shape.text)
365
  return "\n".join(text_content)
366
 
 
367
  def convert_doc_to_text(input_filename: str) -> str:
368
  if 'textract' not in globals():
369
  raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")
@@ -592,110 +592,97 @@ async def convert_file_to_html(
592
  logging.error(f"Erreur lors du démarrage du job : {str(e)}")
593
  return JSONResponse(status_code=500, content={"message": f"Erreur lors du démarrage du job : {str(e)}"})
594
 
595
- async def process_file_accessibility(job_id: str, input_file_path: str, ext: str, original_filename: str):
596
- job_dir = os.path.join(JOBS_DIR, job_id)
597
- try:
598
- update_job_status(job_id, 'processing', 'Le fichier est en cours de traitement')
599
- base_filename = os.path.splitext(original_filename)[0]
600
-
601
- # Conversion en HTML sans extraire les images à ce stade
602
- if ext == '.pdf':
603
- html_content = pdf_to_html(input_file_path)
604
- elif ext in ['.ppt', '.pptx']:
605
- text = convert_ppt_to_text(input_file_path)
606
- html_content = text_to_html(text)
607
- html_content = insert_page_comments_every_15_paragraphs(html_content)
608
- elif ext == '.doc':
609
- text = convert_doc_to_text(input_file_path)
610
- html_content = text_to_html(text)
611
- html_content = insert_page_comments_every_15_paragraphs(html_content)
612
- logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
613
- elif ext in ['.html', '.htm']:
614
- with open(input_file_path, 'r', encoding='utf-8') as f:
615
- raw_html = f.read()
616
- elif ext in ['.html', '.htm']:
617
- with open(input_file_path, 'r', encoding='utf-8') as f:
618
- raw_html = f.read()
619
- try:
620
- doc = Document(raw_html)
621
- cleaned = doc.summary()
622
- html_content = cleaned
623
- except Exception as e:
624
- logging.error(f"Erreur lors du nettoyage HTML : {str(e)}")
625
- html_content = raw_html
626
- html_content = insert_page_comments_every_15_paragraphs(html_content)
627
- else:
628
- input_format = get_pandoc_format(ext)
629
- html_content = convert_with_pandoc(input_file_path, input_format)
630
- html_content = insert_page_comments_every_15_paragraphs(html_content)
631
-
632
- if ext == '.docx':
633
- logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
634
-
635
- # Nettoyage et extraction des images
636
- image_counter = [1]
637
- images_data = {}
638
- logging.debug(f"DEBUG ACCESSIBILITY: HTML avant clean_html_content : {html_content[:500]}...")
639
- cleaned_html = await clean_html_content(html_content, image_counter, images_data)
640
- logging.debug(f"DEBUG ACCESSIBILITY: HTML après clean_html_content : {cleaned_html}...")
641
- logging.debug(f"DEBUG ACCESSIBILITY: images_data après clean_html_content : {images_data}")
642
-
643
- if ext == '.docx':
644
- logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après clean_html_content (complet) : {cleaned_html}") # Afficher le HTML complet
645
-
646
- image_counter = [1]
647
- images_data = {}
648
-
649
- # Décrire les images
650
- for image_key in images_data:
651
- context = get_context_for_image(cleaned_html, image_key)
652
- prompt = (
653
- "Décris ce que l'on peut voir sur cette image, pour qu'un lecteur malvoyant puisse comprendre ce qu'elle représente."
654
- "\nJe vais maintenant te donner les dernières phrases qui précèdent cette image. "
655
- "Prends en compte ce contexte pour l'interpréter :\n\"" + context + "\""
656
- )
657
- images_data[image_key]['prompt'] = prompt
658
-
659
- tasks = []
660
- for image_key in images_data:
661
- base64_image = images_data[image_key]['base64_image']
662
- prompt = images_data[image_key]['prompt']
663
- tasks.append((image_key, asyncio.create_task(get_image_description(base64_image, prompt))))
664
-
665
- results = await asyncio.gather(*(t[1] for t in tasks))
666
- for (image_key, _), description in zip(tasks, results):
667
- images_data[image_key]['description'] = description
668
-
669
- # Réécriture accessible
670
- rewritten_html = await rewrite_html_accessible(cleaned_html)
671
-
672
- # Réinsertion des images
673
- final_html = reinsert_images(rewritten_html, images_data)
674
-
675
- # Nettoyage final
676
- final_soup = BeautifulSoup(final_html, 'html.parser')
677
- scripts_to_remove = final_soup.find_all('script', src=True)
678
- for script in scripts_to_remove:
679
- if script['src'].startswith('https://bentham-converttohtml.hf.space/'):
680
- script.decompose()
681
- final_html = str(final_soup)
682
-
683
- # Supprimer lignes contenant ```
684
- final_html = re.sub(r'^\s*```(?:html)?\s*$', '', final_html, flags=re.MULTILINE)
685
-
686
- # Insérer le CSS
687
- final_html = insert_css_into_html(final_html)
688
-
689
- output_filename = os.path.join(job_dir, f"{base_filename}.html")
690
- with open(output_filename, 'w', encoding='utf-8') as f:
691
- f.write(final_html)
692
-
693
- update_job_status(job_id, 'completed', 'Traitement terminé', result_file=f"{base_filename}.html")
694
- delete_files_after_delay([input_file_path], delay=6000)
695
-
696
- except Exception as e:
697
- logging.error(f"Erreur lors du traitement du job {job_id}: {str(e)}")
698
- update_job_status(job_id, 'error', f"Erreur: {str(e)}")
699
 
700
 
701
  @app.get("/status/{job_id}")
 
355
  if image_extension in ['png', 'jpeg', 'jpg', 'gif']:
356
  mime_type = f'image/{image_extension}'
357
  else:
358
+ mime_type = 'image/png' # Par défaut si l'extension est inconnue
359
  data_uri = f'data:{mime_type};base64,{image_base64}'
360
+ # Ajouter une balise img avec le data URI ; inclure un alt si disponible
361
+ alt_text = shape.name # Ou une logique différente pour déterminer l'alt text
362
  img_tag = f'<img src="{data_uri}" alt="{alt_text}">'
363
  text_content.append(img_tag)
 
364
  return "\n".join(text_content)
365
 
366
+
367
  def convert_doc_to_text(input_filename: str) -> str:
368
  if 'textract' not in globals():
369
  raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")
 
592
  logging.error(f"Erreur lors du démarrage du job : {str(e)}")
593
  return JSONResponse(status_code=500, content={"message": f"Erreur lors du démarrage du job : {str(e)}"})
594
 
595
async def process_file_accessibility(job_id: str, input_file_path: str, ext: str, original_filename: str) -> None:
    """Run the accessibility conversion job for one input file.

    The input is converted to HTML according to ``ext``, passed through
    ``clean_html_content`` (which fills ``images_data``), every collected
    image is described via ``get_image_description``, the HTML is rewritten
    with ``rewrite_html_accessible``, the images are re-inserted, and the
    result is written to ``<job_dir>/<base name>.html``. Progress and the
    outcome are reported through ``update_job_status``.

    Args:
        job_id: Identifier of the job; also the name of its directory
            under ``JOBS_DIR``.
        input_file_path: Path of the file to convert.
        ext: File extension including the leading dot (compared against
            values such as ``'.pdf'`` and ``'.docx'``).
        original_filename: Name used to derive the output file name.
            Only its final path component is used.

    Returns:
        None. Any exception raised during processing is logged and
        recorded as an ``'error'`` job status instead of propagating.
    """
    job_dir = os.path.join(JOBS_DIR, job_id)
    try:
        update_job_status(job_id, 'processing', 'Le fichier est en cours de traitement')
        # Keep only the last path component so that a name containing
        # directory parts (e.g. "../../x.pdf") cannot place the output
        # outside job_dir. NOTE(review): presumably original_filename comes
        # from the upload request -- confirm against the caller.
        base_filename = os.path.splitext(os.path.basename(original_filename))[0]

        # Convert to HTML; images are not extracted at this stage.
        if ext == '.pdf':
            html_content = pdf_to_html(input_file_path)
        elif ext in ['.ppt', '.pptx']:
            html_content = text_to_html(convert_ppt_to_text(input_file_path))
        elif ext == '.doc':
            html_content = text_to_html(convert_doc_to_text(input_file_path))
        elif ext in ['.html', '.htm']:
            with open(input_file_path, 'r', encoding='utf-8') as f:
                raw_html = f.read()
            try:
                # Best-effort cleanup: fall back to the raw HTML if the
                # summary extraction fails for any reason.
                html_content = Document(raw_html).summary()
            except Exception as e:
                logging.error(f"Erreur lors du nettoyage HTML : {str(e)}")
                html_content = raw_html
        else:
            input_format = get_pandoc_format(ext)
            html_content = convert_with_pandoc(input_file_path, input_format)

        # Debug messages use lazy %-formatting so the (potentially very
        # large) HTML and base64 payloads are only rendered when DEBUG
        # logging is actually enabled.
        if ext == '.docx':
            logging.debug("DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : %s...", html_content[:500])

        # Clean the HTML and collect the images into images_data.
        image_counter = [1]
        images_data = {}
        logging.debug("DEBUG ACCESSIBILITY: HTML avant clean_html_content : %s...", html_content[:500])
        cleaned_html = await clean_html_content(html_content, image_counter, images_data)
        logging.debug("DEBUG ACCESSIBILITY: HTML après clean_html_content : %s...", cleaned_html)
        logging.debug("DEBUG ACCESSIBILITY: images_data après clean_html_content : %s", images_data)

        # Build one description prompt per image and start its request.
        tasks = []
        for image_key, image_info in images_data.items():
            context = get_context_for_image(cleaned_html, image_key)
            prompt = (
                "Décris ce que l'on peut voir sur cette image, pour qu'un lecteur malvoyant puisse comprendre ce qu'elle représente."
                "\nJe vais maintenant te donner les dernières phrases qui précèdent cette image. "
                "Prends en compte ce contexte pour l'interpréter :\n\"" + context + "\""
            )
            image_info['prompt'] = prompt
            tasks.append((image_key, asyncio.create_task(get_image_description(image_info['base64_image'], prompt))))

        # Wait for all descriptions; gather preserves the order of tasks.
        results = await asyncio.gather(*(task for _, task in tasks))
        for (image_key, _), description in zip(tasks, results):
            images_data[image_key]['description'] = description

        # Accessible rewrite of the cleaned HTML.
        rewritten_html = await rewrite_html_accessible(cleaned_html)

        # Put the images back into the rewritten HTML.
        final_html = reinsert_images(rewritten_html, images_data)

        # Final cleanup: drop <script src=...> tags served from this space.
        final_soup = BeautifulSoup(final_html, 'html.parser')
        for script in final_soup.find_all('script', src=True):
            if script['src'].startswith('https://bentham-converttohtml.hf.space/'):
                script.decompose()
        final_html = str(final_soup)

        # Blank out lines that consist only of a ``` or ```html fence.
        final_html = re.sub(r'^\s*```(?:html)?\s*$', '', final_html, flags=re.MULTILINE)

        # Insert the CSS.
        final_html = insert_css_into_html(final_html)

        output_filename = os.path.join(job_dir, f"{base_filename}.html")
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(final_html)

        update_job_status(job_id, 'completed', 'Traitement terminé', result_file=f"{base_filename}.html")
        # NOTE(review): the input file is only scheduled for deletion on
        # success; confirm whether failed jobs should clean up as well.
        delete_files_after_delay([input_file_path], delay=6000)

    except Exception as e:
        # Top-level job boundary: record the failure (with traceback) and
        # surface it through the job status rather than raising.
        logging.exception("Erreur lors du traitement du job %s: %s", job_id, e)
        update_job_status(job_id, 'error', f"Erreur: {str(e)}")
685
+
 
 
 
 
 
 
 
 
 
 
 
 
 
686
 
687
 
688
  @app.get("/status/{job_id}")