Bentham committed on
Commit
92bfb62
·
verified ·
1 Parent(s): 881bf2e

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +90 -90
main.py CHANGED
@@ -592,96 +592,96 @@ async def convert_file_to_html(
592
  logging.error(f"Erreur lors du démarrage du job : {str(e)}")
593
  return JSONResponse(status_code=500, content={"message": f"Erreur lors du démarrage du job : {str(e)}"})
594
 
595
- async def process_file_accessibility(job_id: str, input_file_path: str, ext: str, original_filename: str):
596
- job_dir = os.path.join(JOBS_DIR, job_id)
597
- try:
598
- update_job_status(job_id, 'processing', 'Le fichier est en cours de traitement')
599
- base_filename = os.path.splitext(original_filename)[0]
600
-
601
- # Conversion en HTML sans extraire les images à ce stade
602
- if ext == '.pdf':
603
- html_content = pdf_to_html(input_file_path)
604
- elif ext in ['.ppt', '.pptx']:
605
- text = convert_ppt_to_text(input_file_path)
606
- html_content = text_to_html(text)
607
- elif ext == '.doc':
608
- text = convert_doc_to_text(input_file_path)
609
- html_content = text_to_html(text)
610
- elif ext in ['.html', '.htm']:
611
- with open(input_file_path, 'r', encoding='utf-8') as f:
612
- raw_html = f.read()
613
- try:
614
- doc = Document(raw_html)
615
- cleaned = doc.summary()
616
- html_content = cleaned
617
- except Exception as e:
618
- logging.error(f"Erreur lors du nettoyage HTML : {str(e)}")
619
- html_content = raw_html
620
- else:
621
- input_format = get_pandoc_format(ext)
622
- html_content = convert_with_pandoc(input_file_path, input_format)
623
-
624
- if ext == '.docx':
625
- logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
626
-
627
- # Nettoyage et extraction des images
628
- image_counter = [1]
629
- images_data = {}
630
- logging.debug(f"DEBUG ACCESSIBILITY: HTML avant clean_html_content : {html_content[:500]}...")
631
- cleaned_html = await clean_html_content(html_content, image_counter, images_data)
632
- logging.debug(f"DEBUG ACCESSIBILITY: HTML après clean_html_content : {cleaned_html}...")
633
- logging.debug(f"DEBUG ACCESSIBILITY: images_data après clean_html_content : {images_data}")
634
-
635
- # Décrire les images
636
- for image_key in images_data:
637
- context = get_context_for_image(cleaned_html, image_key)
638
- prompt = (
639
- "Décris ce que l'on peut voir sur cette image, pour qu'un lecteur malvoyant puisse comprendre ce qu'elle représente."
640
- "\nJe vais maintenant te donner les dernières phrases qui précèdent cette image. "
641
- "Prends en compte ce contexte pour l'interpréter :\n\"" + context + "\""
642
- )
643
- images_data[image_key]['prompt'] = prompt
644
-
645
- tasks = []
646
- for image_key in images_data:
647
- base64_image = images_data[image_key]['base64_image']
648
- prompt = images_data[image_key]['prompt']
649
- tasks.append((image_key, asyncio.create_task(get_image_description(base64_image, prompt))))
650
-
651
- results = await asyncio.gather(*(t[1] for t in tasks))
652
- for (image_key, _), description in zip(tasks, results):
653
- images_data[image_key]['description'] = description
654
-
655
- # Réécriture accessible
656
- rewritten_html = await rewrite_html_accessible(cleaned_html)
657
-
658
- # Réinsertion des images
659
- final_html = reinsert_images(rewritten_html, images_data)
660
-
661
- # Nettoyage final
662
- final_soup = BeautifulSoup(final_html, 'html.parser')
663
- scripts_to_remove = final_soup.find_all('script', src=True)
664
- for script in scripts_to_remove:
665
- if script['src'].startswith('https://bentham-converttohtml.hf.space/'):
666
- script.decompose()
667
- final_html = str(final_soup)
668
-
669
- # Supprimer lignes contenant ```
670
- final_html = re.sub(r'^\s*```(?:html)?\s*$', '', final_html, flags=re.MULTILINE)
671
-
672
- # Insérer le CSS
673
- final_html = insert_css_into_html(final_html)
674
-
675
- output_filename = os.path.join(job_dir, f"{base_filename}.html")
676
- with open(output_filename, 'w', encoding='utf-8') as f:
677
- f.write(final_html)
678
-
679
- update_job_status(job_id, 'completed', 'Traitement terminé', result_file=f"{base_filename}.html")
680
- delete_files_after_delay([input_file_path], delay=6000)
681
-
682
- except Exception as e:
683
- logging.error(f"Erreur lors du traitement du job {job_id}: {str(e)}")
684
- update_job_status(job_id, 'error', f"Erreur: {str(e)}")
685
 
686
 
687
 
 
592
  logging.error(f"Erreur lors du démarrage du job : {str(e)}")
593
  return JSONResponse(status_code=500, content={"message": f"Erreur lors du démarrage du job : {str(e)}"})
594
 
595
def _accessibility_source_to_html(input_file_path: str, ext: str) -> str:
    """Convert the input file to raw HTML, dispatching on its extension.

    Images are not extracted at this stage; the caller does that afterwards.

    Args:
        input_file_path: Path of the file to convert.
        ext: File extension including the leading dot (compared against
            '.pdf', '.ppt', '.pptx', '.doc', '.html', '.htm'); any other
            value is handed to Pandoc.

    Returns:
        The HTML produced by the matching converter.
    """
    if ext == '.pdf':
        return pdf_to_html(input_file_path)
    if ext in ('.ppt', '.pptx'):
        return text_to_html(convert_ppt_to_text(input_file_path))
    if ext == '.doc':
        return text_to_html(convert_doc_to_text(input_file_path))
    if ext in ('.html', '.htm'):
        with open(input_file_path, 'r', encoding='utf-8') as f:
            raw_html = f.read()
        try:
            # Presumably a readability-style extractor; if it fails we fall
            # back to the untouched HTML rather than failing the job.
            return Document(raw_html).summary()
        except Exception as e:
            logging.error(f"Erreur lors du nettoyage HTML : {str(e)}")
            return raw_html
    # Every other extension goes through Pandoc.
    return convert_with_pandoc(input_file_path, get_pandoc_format(ext))


def _accessibility_image_prompt(context: str) -> str:
    """Build the image-description prompt, embedding the text preceding the image."""
    return (
        "Décris ce que l'on peut voir sur cette image, pour qu'un lecteur malvoyant puisse comprendre ce qu'elle représente."
        "\nJe vais maintenant te donner les dernières phrases qui précèdent cette image. "
        "Prends en compte ce contexte pour l'interpréter :\n\"" + context + "\""
    )


async def process_file_accessibility(job_id: str, input_file_path: str, ext: str, original_filename: str) -> None:
    """Convert an input file into an accessible HTML document for a job.

    Steps: convert the file to HTML, clean it and extract its images, obtain
    a description for every image, rewrite the HTML accessibly, reinsert the
    images, strip leftover scripts and Markdown code fences, inject the CSS,
    and write ``<base name>.html`` into the job directory.

    Progress is reported through ``update_job_status`` ('processing', then
    'completed' or 'error'). Exceptions raised during processing are caught,
    logged with their traceback, and recorded as an 'error' status instead of
    propagating.

    Args:
        job_id: Identifier of the job; also the name of its directory under
            ``JOBS_DIR``.
        input_file_path: Path of the file to convert.
        ext: Extension of the input file, including the leading dot.
        original_filename: Name the file was submitted under; only its base
            name (without directories or extension) is used for the output.
    """
    job_dir = os.path.join(JOBS_DIR, job_id)
    try:
        update_job_status(job_id, 'processing', 'Le fichier est en cours de traitement')

        # Keep only the final path component so a name such as
        # "../../x.docx" cannot make the output land outside job_dir.
        # Backslashes are normalised first so Windows-style paths are
        # stripped on POSIX hosts too.
        safe_name = os.path.basename(original_filename.replace('\\', '/'))
        base_filename = os.path.splitext(safe_name)[0] or 'document'

        # NOTE(review): the converters are called without await, so they run
        # synchronously inside this coroutine — confirm that is acceptable
        # for large inputs.
        html_content = _accessibility_source_to_html(input_file_path, ext)

        if ext == '.docx':
            logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")

        # Clean the HTML and extract the images.
        image_counter = [1]
        images_data = {}
        logging.debug(f"DEBUG ACCESSIBILITY: HTML avant clean_html_content : {html_content[:500]}...")
        cleaned_html = await clean_html_content(html_content, image_counter, images_data)
        # Debug output is truncated / reduced to keys: the full document and
        # the base64 image payloads are too large to be useful in a log line.
        logging.debug(f"DEBUG ACCESSIBILITY: HTML après clean_html_content : {cleaned_html[:500]}...")
        logging.debug(f"DEBUG ACCESSIBILITY: images_data après clean_html_content : {list(images_data)}")

        # Describe the images: build one prompt per image from its
        # surrounding text, then request all descriptions concurrently.
        for image_key, image_info in images_data.items():
            context = get_context_for_image(cleaned_html, image_key)
            image_info['prompt'] = _accessibility_image_prompt(context)

        descriptions = await asyncio.gather(
            *(
                get_image_description(image_info['base64_image'], image_info['prompt'])
                for image_info in images_data.values()
            )
        )
        # gather() returns results in argument order, which matches the
        # dict's iteration order.
        for image_info, description in zip(images_data.values(), descriptions):
            image_info['description'] = description

        # Accessible rewrite.
        rewritten_html = await rewrite_html_accessible(cleaned_html)

        # Put the images back.
        final_html = reinsert_images(rewritten_html, images_data)

        # Final cleanup: drop external scripts served from the app's own host.
        final_soup = BeautifulSoup(final_html, 'html.parser')
        for script in final_soup.find_all('script', src=True):
            if script['src'].startswith('https://bentham-converttohtml.hf.space/'):
                script.decompose()
        final_html = str(final_soup)

        # Remove lines that consist only of a Markdown code fence (``` or ```html).
        final_html = re.sub(r'^\s*```(?:html)?\s*$', '', final_html, flags=re.MULTILINE)

        # Inject the CSS.
        final_html = insert_css_into_html(final_html)

        output_filename = os.path.join(job_dir, f"{base_filename}.html")
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(final_html)

        update_job_status(job_id, 'completed', 'Traitement terminé', result_file=f"{base_filename}.html")
        # NOTE(review): the input file is only scheduled for deletion on
        # success; the unit of `delay` is not visible here — confirm both.
        delete_files_after_delay([input_file_path], delay=6000)

    except Exception as e:
        # Top-level boundary of the job: keep the traceback in the log and
        # surface the failure through the job status.
        logging.exception(f"Erreur lors du traitement du job {job_id}: {str(e)}")
        update_job_status(job_id, 'error', f"Erreur: {str(e)}")
685
 
686
 
687