convert_test

Sleeping

App Files Files Community

Bentham committed on Dec 20, 2024

Commit

65a1e31

verified ·

1 Parent(s): 1aae06b

debug gemini

Browse files

Files changed (1) hide show

main.py +17 -8

main.py CHANGED Viewed

@@ -594,6 +594,7 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
             text = convert_doc_to_text(input_file_path)
             html_content = text_to_html(text)
             html_content = insert_page_comments_every_15_paragraphs(html_content)
         elif ext in ['.html', '.htm']:
             with open(input_file_path, 'r', encoding='utf-8') as f:
                 raw_html = f.read()
@@ -613,8 +614,10 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
         # Nettoyage et extraction des images
         image_counter = [1]
         images_data = {}
         cleaned_html = await clean_html_content(html_content, image_counter, images_data)
-        logging.debug(f"HTML nettoyé après extraction des images : {cleaned_html[:500]}...")  # Affiche les 500 premiers caractères
         # Décrire les images
@@ -750,18 +753,22 @@ async def convert_file_to_txt(
         else:
             input_format = get_pandoc_format(ext)
             html_content = convert_with_pandoc(input_filename, input_format)
         # Nettoyage et extraction des images
         image_counter = [1]
         images_data = {}
         cleaned_html = await clean_html_content(html_content, image_counter, images_data)
         # Vérification des commentaires IMG_X dans le HTML nettoyé
         cleaned_soup = BeautifulSoup(cleaned_html, 'html.parser')
-        if any(re.match(r'IMG_\d+', str(comment)) for comment in cleaned_soup.find_all(string=lambda text: isinstance(text, Comment))):
-            logging.debug("Les commentaires IMG_X sont présents dans le HTML nettoyé.")
-        else:
-            logging.error("Les commentaires IMG_X ne sont PAS présents dans le HTML nettoyé.")
         # Description des images pour le mode texte
         for image_key in images_data:
@@ -787,13 +794,15 @@ async def convert_file_to_txt(
             images_data[image_key]['description'] = description
         # Réinsertion des images avec descriptions
         final_html = reinsert_images(cleaned_html, images_data)
-        logging.debug(f"HTML final après réinsertion des images : {final_html[:500]}...")  # Affiche les 500 premiers caractères
         # Vérification des descriptions insérées
         soup_final = BeautifulSoup(final_html, 'html.parser')
         description_paragraphs = soup_final.find_all('p', class_='description')
-        logging.debug(f"Nombre de descriptions insérées : {len(description_paragraphs)}")
         for desc in description_paragraphs:
             logging.debug(f"Description insérée : {desc.get_text(strip=True)}")

             text = convert_doc_to_text(input_file_path)
             html_content = text_to_html(text)
             html_content = insert_page_comments_every_15_paragraphs(html_content)
+            logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
         elif ext in ['.html', '.htm']:
             with open(input_file_path, 'r', encoding='utf-8') as f:
                 raw_html = f.read()
         # Nettoyage et extraction des images
         image_counter = [1]
         images_data = {}
+        logging.debug(f"DEBUG ACCESSIBILITY: HTML avant clean_html_content : {html_content[:500]}...")
         cleaned_html = await clean_html_content(html_content, image_counter, images_data)
+        logging.debug(f"DEBUG ACCESSIBILITY: HTML après clean_html_content : {cleaned_html[:500]}...")
+        logging.debug(f"DEBUG ACCESSIBILITY: images_data après clean_html_content : {images_data}")
         # Décrire les images
         else:
             input_format = get_pandoc_format(ext)
             html_content = convert_with_pandoc(input_filename, input_format)
+        if ext == '.docx':
+            logging.debug(f"DEBUG CONVERT_TO_TXT (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
         # Nettoyage et extraction des images
         image_counter = [1]
         images_data = {}
+        logging.debug(f"DEBUG CONVERT_TO_TXT: HTML avant clean_html_content : {html_content[:500]}...")
         cleaned_html = await clean_html_content(html_content, image_counter, images_data)
+        logging.debug(f"DEBUG CONVERT_TO_TXT: HTML après clean_html_content : {cleaned_html[:500]}...")
+        logging.debug(f"DEBUG CONVERT_TO_TXT: images_data après clean_html_content : {images_data}")
         # Vérification des commentaires IMG_X dans le HTML nettoyé
         cleaned_soup = BeautifulSoup(cleaned_html, 'html.parser')
+        has_img_comments = any(re.match(r'IMG_\d+', str(comment)) for comment in cleaned_soup.find_all(string=lambda text: isinstance(text, Comment)))
+        logging.debug(f"DEBUG CONVERT_TO_TXT: Présence de commentaires IMG_X après nettoyage : {has_img_comments}")
         # Description des images pour le mode texte
         for image_key in images_data:
             images_data[image_key]['description'] = description
         # Réinsertion des images avec descriptions
+        logging.debug(f"DEBUG CONVERT_TO_TXT: HTML avant reinsert_images : {cleaned_html[:500]}...")
+        logging.debug(f"DEBUG CONVERT_TO_TXT: images_data avant reinsert_images : {images_data}")
         final_html = reinsert_images(cleaned_html, images_data)
+        logging.debug(f"DEBUG CONVERT_TO_TXT: HTML après reinsert_images : {final_html[:500]}...")
         # Vérification des descriptions insérées
         soup_final = BeautifulSoup(final_html, 'html.parser')
         description_paragraphs = soup_final.find_all('p', class_='description')
+        logging.debug(f"DEBUG CONVERT_TO_TXT: Nombre de descriptions insérées : {len(description_paragraphs)}")
         for desc in description_paragraphs:
             logging.debug(f"Description insérée : {desc.get_text(strip=True)}")