re-debug google
main.py CHANGED
@@ -372,6 +372,7 @@ def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
 
 async def clean_html_content(html_content: str, image_counter: List[int], images_data: Dict[str, Dict[str, str]]) -> str:
     soup = BeautifulSoup(html_content, 'html.parser')
+    logging.debug(f"DEBUG CLEAN_HTML: Début de clean_html_content")
 
     for tag in soup.find_all():
         if 'style' in tag.attrs:
@@ -384,6 +385,8 @@ async def clean_html_content(html_content: str, image_counter: List[int], images
         span.unwrap()
 
     img_tags = soup.find_all('img')
+    logging.debug(f"DEBUG CLEAN_HTML: Nombre de balises <img> trouvées : {len(img_tags)}")
+
     if img_tags:
         if len(img_tags) > 20:
             logging.warning(f"Number of images ({len(img_tags)}) exceeds 20. Images will be ignored.")
@@ -392,22 +395,27 @@ async def clean_html_content(html_content: str, image_counter: List[int], images
         else:
             for img in img_tags:
                 src = img.get('src', '')
+                logging.debug(f"DEBUG CLEAN_HTML: Traitement de la balise <img> avec src='{src[:100]}...'")  # Afficher le début du src
                 X = image_counter[0]
                 if src.startswith('data:image/'):
+                    logging.debug(f"DEBUG CLEAN_HTML: src commence par data:image/")
                     base64_image = encode_image_from_data_uri(src)
                     if base64_image:
                         images_data[f"IMG_{X}"] = {
                             'base64_image': base64_image
                         }
                         placeholder = f"<!--IMG_{X}-->"
+                        logging.debug(f"DEBUG CLEAN_HTML: Remplacement par le commentaire : {placeholder}")
                         img.replace_with(BeautifulSoup(placeholder, 'html.parser'))
                         image_counter[0] += 1
                     else:
+                        logging.debug(f"DEBUG CLEAN_HTML: Erreur lors de l'encodage base64, suppression de l'image.")
                         img.decompose()
                 else:
+                    logging.debug(f"DEBUG CLEAN_HTML: src ne commence PAS par data:image/, suppression de l'image.")
                     img.decompose()
     else:
-        logging.debug("
+        logging.debug("DEBUG CLEAN_HTML: Aucune balise <img> trouvée dans le contenu HTML.")
 
     for img in soup.find_all('img'):
         img.decompose()
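Note: `encode_image_from_data_uri` is referenced above but its body is not part of this diff. Below is a minimal sketch of what such a helper could plausibly look like, assuming standard `data:image/<subtype>;base64,<payload>` URIs; it is inferred from the call site and is not the actual implementation in main.py.

```python
import base64
import logging
from typing import Optional

def encode_image_from_data_uri(src: str) -> Optional[str]:
    """Return the base64 payload of an image data URI, or None on failure.

    Sketch only: inferred from how clean_html_content uses the result;
    the real helper in main.py may behave differently.
    """
    header, sep, payload = src.partition(',')
    if not sep or not header.startswith('data:image/') or ';base64' not in header:
        return None
    try:
        # Validate that the payload really is base64 before handing it back.
        base64.b64decode(payload, validate=True)
    except (ValueError, TypeError) as exc:
        logging.debug(f"Invalid base64 payload in data URI: {exc}")
        return None
    return payload
```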
@@ -422,6 +430,7 @@ async def clean_html_content(html_content: str, image_counter: List[int], images
         if not tag.get_text(strip=True):
             tag.decompose()
 
+    logging.debug(f"DEBUG CLEAN_HTML: Fin de clean_html_content")
     return str(soup)
 
 def reinsert_images(html_content: str, images_data: Dict[str, Dict[str, str]]) -> str:
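For orientation, the round trip these two functions implement: `clean_html_content` swaps each inline image for an `<!--IMG_X-->` comment and stashes its base64 payload in `images_data`, and `reinsert_images` is its counterpart that puts the images back. A hedged usage sketch (the driver code and sample HTML below are illustrative and assume both functions are in scope, as in main.py):

```python
import asyncio

SAMPLE_HTML = '<p>Before</p><img src="data:image/png;base64,iVBORw0KGgo="><p>After</p>'

async def demo() -> None:
    image_counter = [1]   # mutable counter so the numbering survives the call
    images_data = {}      # will receive {"IMG_1": {"base64_image": "..."}}
    cleaned = await clean_html_content(SAMPLE_HTML, image_counter, images_data)
    # cleaned should now contain "<!--IMG_1-->" where the <img> tag used to be
    restored = reinsert_images(cleaned, images_data)
    print(restored)

asyncio.run(demo())
```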
@@ -611,6 +620,9 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
     html_content = convert_with_pandoc(input_file_path, input_format)
     html_content = insert_page_comments_every_15_paragraphs(html_content)
 
+    if ext == '.docx':
+        logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
+
     # Nettoyage et extraction des images
     image_counter = [1]
     images_data = {}
@@ -619,6 +631,8 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
     logging.debug(f"DEBUG ACCESSIBILITY: HTML après clean_html_content : {cleaned_html[:500]}...")
     logging.debug(f"DEBUG ACCESSIBILITY: images_data après clean_html_content : {images_data}")
 
+    if ext == '.docx':
+        logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après clean_html_content (complet) : {cleaned_html}")  # Afficher le HTML complet
 
     # Décrire les images
     for image_key in images_data:
@@ -756,7 +770,7 @@ async def convert_file_to_txt(
 
     if ext == '.docx':
         logging.debug(f"DEBUG CONVERT_TO_TXT (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
-
+
     # Nettoyage et extraction des images
     image_counter = [1]
     images_data = {}
@@ -765,6 +779,9 @@ async def convert_file_to_txt(
     logging.debug(f"DEBUG CONVERT_TO_TXT: HTML après clean_html_content : {cleaned_html[:500]}...")
     logging.debug(f"DEBUG CONVERT_TO_TXT: images_data après clean_html_content : {images_data}")
 
+    if ext == '.docx':
+        logging.debug(f"DEBUG CONVERT_TO_TXT (.docx): HTML après clean_html_content (complet) : {cleaned_html}")  # Afficher le HTML complet
+
     # Vérification des commentaires IMG_X dans le HTML nettoyé
     cleaned_soup = BeautifulSoup(cleaned_html, 'html.parser')
     has_img_comments = any(re.match(r'IMG_\d+', str(comment)) for comment in cleaned_soup.find_all(string=lambda text: isinstance(text, Comment)))
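Note on the `has_img_comments` check: BeautifulSoup exposes HTML comments as `Comment` navigable strings whose `str()` form is the comment text without the `<!-- -->` delimiters, so `re.match(r'IMG_\d+', ...)` anchors at the start of that text. A standalone illustration of the detection pattern (the sample HTML is made up):

```python
import re
from bs4 import BeautifulSoup, Comment

soup = BeautifulSoup('<p>text</p><!--IMG_1--><p>more</p>', 'html.parser')
comments = soup.find_all(string=lambda text: isinstance(text, Comment))
print([str(c) for c in comments])   # ['IMG_1']
print(any(re.match(r'IMG_\d+', str(c)) for c in comments))  # True
```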