debug gemini

main.py CHANGED
@@ -594,6 +594,7 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
         text = convert_doc_to_text(input_file_path)
         html_content = text_to_html(text)
         html_content = insert_page_comments_every_15_paragraphs(html_content)
+        logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML after Pandoc conversion: {html_content[:500]}...")
     elif ext in ['.html', '.htm']:
         with open(input_file_path, 'r', encoding='utf-8') as f:
             raw_html = f.read()
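Note that the logging.debug traces added throughout this commit only appear if the application configures the root logger at DEBUG level somewhere at startup; a minimal sketch of such a setup (the format string is illustrative, not taken from this repo):

import logging

# Assumed startup configuration; without a DEBUG-level handler,
# every logging.debug call added in this commit is silently dropped.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s %(levelname)s %(message)s",
)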
@@ -613,8 +614,10 @@
     # Clean up and extract the images
     image_counter = [1]
     images_data = {}
+    logging.debug(f"DEBUG ACCESSIBILITY: HTML before clean_html_content: {html_content[:500]}...")
     cleaned_html = await clean_html_content(html_content, image_counter, images_data)
-    logging.debug(f"HTML
+    logging.debug(f"DEBUG ACCESSIBILITY: HTML after clean_html_content: {cleaned_html[:500]}...")
+    logging.debug(f"DEBUG ACCESSIBILITY: images_data after clean_html_content: {images_data}")


     # Describe the images
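The image_counter = [1] seen above is a one-element list used as a mutable integer, so clean_html_content can advance the IMG_X numbering in place across calls. A self-contained illustration of the idiom (the helper name next_image_index is hypothetical, not from this repo):

def next_image_index(counter: list) -> int:
    # Hypothetical helper: reads and advances the shared counter in place.
    n = counter[0]
    counter[0] += 1
    return n

image_counter = [1]
print(next_image_index(image_counter))  # 1
print(next_image_index(image_counter))  # 2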
@@ -750,18 +753,22 @@ async def convert_file_to_txt(
     else:
         input_format = get_pandoc_format(ext)
         html_content = convert_with_pandoc(input_filename, input_format)
-
+
+    if ext == '.docx':
+        logging.debug(f"DEBUG CONVERT_TO_TXT (.docx): HTML after Pandoc conversion: {html_content[:500]}...")
+
     # Clean up and extract the images
     image_counter = [1]
     images_data = {}
+    logging.debug(f"DEBUG CONVERT_TO_TXT: HTML before clean_html_content: {html_content[:500]}...")
     cleaned_html = await clean_html_content(html_content, image_counter, images_data)
+    logging.debug(f"DEBUG CONVERT_TO_TXT: HTML after clean_html_content: {cleaned_html[:500]}...")
+    logging.debug(f"DEBUG CONVERT_TO_TXT: images_data after clean_html_content: {images_data}")

     # Check for IMG_X comments in the cleaned HTML
     cleaned_soup = BeautifulSoup(cleaned_html, 'html.parser')
-
-
-    else:
-        logging.error("IMG_X comments are NOT present in the cleaned HTML.")
+    has_img_comments = any(re.match(r'IMG_\d+', str(comment)) for comment in cleaned_soup.find_all(string=lambda text: isinstance(text, Comment)))
+    logging.debug(f"DEBUG CONVERT_TO_TXT: IMG_X comments present after cleanup: {has_img_comments}")

     # Describe the images for text mode
     for image_key in images_data:
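The has_img_comments check added here can be exercised on its own; a runnable sketch with made-up sample markup (the real input comes from clean_html_content, which this diff does not show). Note that re.match anchors at the start of the string, so a marker written as <!-- IMG_1 --> with leading whitespace would not be counted:

import re
from bs4 import BeautifulSoup, Comment

# Sample input standing in for cleaned_html.
cleaned_html = "<p>before</p><!--IMG_1--><p>after</p>"
cleaned_soup = BeautifulSoup(cleaned_html, "html.parser")
comments = cleaned_soup.find_all(string=lambda text: isinstance(text, Comment))
has_img_comments = any(re.match(r"IMG_\d+", str(c)) for c in comments)
print(has_img_comments)  # True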
@@ -787,13 +794,15 @@ async def convert_file_to_txt(
         images_data[image_key]['description'] = description

     # Reinsert the images with their descriptions
+    logging.debug(f"DEBUG CONVERT_TO_TXT: HTML before reinsert_images: {cleaned_html[:500]}...")
+    logging.debug(f"DEBUG CONVERT_TO_TXT: images_data before reinsert_images: {images_data}")
     final_html = reinsert_images(cleaned_html, images_data)
-    logging.debug(f"HTML
+    logging.debug(f"DEBUG CONVERT_TO_TXT: HTML after reinsert_images: {final_html[:500]}...")

     # Check the inserted descriptions
     soup_final = BeautifulSoup(final_html, 'html.parser')
     description_paragraphs = soup_final.find_all('p', class_='description')
-    logging.debug(f"Number of inserted descriptions: {len(description_paragraphs)}")
+    logging.debug(f"DEBUG CONVERT_TO_TXT: Number of inserted descriptions: {len(description_paragraphs)}")
     for desc in description_paragraphs:
         logging.debug(f"Inserted description: {desc.get_text(strip=True)}")

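The final verification can likewise be reproduced in isolation; a minimal example (the sample paragraph is invented, and the <p class="description"> convention is inferred from the find_all call above, not confirmed by the rest of the file):

from bs4 import BeautifulSoup

# Stand-in for the output of reinsert_images.
final_html = '<!--IMG_1--><p class="description">Image 1: a bar chart</p>'
soup_final = BeautifulSoup(final_html, "html.parser")
description_paragraphs = soup_final.find_all("p", class_="description")
print(len(description_paragraphs))                     # 1
print(description_paragraphs[0].get_text(strip=True))  # Image 1: a bar chart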