convert_test

Sleeping

App Files Files Community

Bentham committed on Dec 20, 2024

Commit

72873ed

verified ·

1 Parent(s): 84fbe33

Update main.py

Browse files

Files changed (1) hide show

main.py +10 -5

main.py CHANGED Viewed

@@ -370,6 +370,8 @@ def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
         logging.error(f"Erreur lors du nettoyage du fichier HTML {input_filepath} : {str(e)}")
         return False
 async def clean_html_content(html_content: str, image_counter: List[int], images_data: Dict[str, Dict[str, str]]) -> str:
     soup = BeautifulSoup(html_content, 'html.parser')
     logging.debug(f"DEBUG CLEAN_HTML: Début de clean_html_content")
@@ -404,11 +406,11 @@ async def clean_html_content(html_content: str, image_counter: List[int], images
                         images_data[f"IMG_{X}"] = {
                             'base64_image': base64_image
                         }
-                        placeholder = f"<!--IMG_{X}-->"
                         comment_tag = Comment(f"IMG_{X}")
                         img.insert_before(comment_tag)
                         logging.debug(f"DEBUG CLEAN_HTML: Insertion du commentaire avant l'image : {comment_tag}")
-                        img.decompose() # Remove the image tag
                         logging.debug(f"DEBUG CLEAN_HTML: Suppression de la balise img.")
                         image_counter[0] += 1
                     else:
@@ -420,6 +422,9 @@ async def clean_html_content(html_content: str, image_counter: List[int], images
     else:
         logging.debug("DEBUG CLEAN_HTML: Aucune balise <img> trouvée dans le contenu HTML.")
     for img in soup.find_all('img'):
         img.decompose()
@@ -436,6 +441,7 @@ async def clean_html_content(html_content: str, image_counter: List[int], images
     logging.debug(f"DEBUG CLEAN_HTML: Fin de clean_html_content")
     return str(soup)
 def reinsert_images(html_content: str, images_data: Dict[str, Dict[str, str]]) -> str:
     soup = BeautifulSoup(html_content, 'html.parser')
@@ -447,7 +453,7 @@ def reinsert_images(html_content: str, images_data: Dict[str, Dict[str, str]]) -
             if image_key in images_data:
                 img_tag = soup.new_tag('img')
                 img_tag['src'] = f"data:image/jpeg;base64,{images_data[image_key]['base64_image']}"
-                img_tag['alt'] = images_data[image_key]['description']
                 new_content = soup.new_tag('div', attrs={'class': 'image-block'})
                 new_content.append(img_tag)
@@ -456,11 +462,10 @@ def reinsert_images(html_content: str, images_data: Dict[str, Dict[str, str]]) -
                 description = images_data[image_key]['description'].replace('\n', ' ').strip()
                 description = re.sub(r'\s+', ' ', description)
                 p_tag.string = f"Image {image_number} : {description}"
-                p_tag.append("\n")
                 new_content.append(p_tag)
                 # Ajout d'un journal pour vérifier l'insertion
-                logging.debug(f"Inserting description for {image_key}: {p_tag.string}")
                 comment.replace_with(new_content)
             else:

         logging.error(f"Erreur lors du nettoyage du fichier HTML {input_filepath} : {str(e)}")
         return False
+from bs4 import Comment
 async def clean_html_content(html_content: str, image_counter: List[int], images_data: Dict[str, Dict[str, str]]) -> str:
     soup = BeautifulSoup(html_content, 'html.parser')
     logging.debug(f"DEBUG CLEAN_HTML: Début de clean_html_content")
                         images_data[f"IMG_{X}"] = {
                             'base64_image': base64_image
                         }
+                        # Création d'un véritable nœud de commentaire
                         comment_tag = Comment(f"IMG_{X}")
                         img.insert_before(comment_tag)
                         logging.debug(f"DEBUG CLEAN_HTML: Insertion du commentaire avant l'image : {comment_tag}")
+                        img.decompose()  # Supprimer la balise img
                         logging.debug(f"DEBUG CLEAN_HTML: Suppression de la balise img.")
                         image_counter[0] += 1
                     else:
     else:
         logging.debug("DEBUG CLEAN_HTML: Aucune balise <img> trouvée dans le contenu HTML.")
+    # Vérifiez que les commentaires sont bien insérés
+    logging.debug(f"DEBUG CLEAN_HTML: HTML après insertion des commentaires IMG_X : {str(soup)[:500]}...")
     for img in soup.find_all('img'):
         img.decompose()
     logging.debug(f"DEBUG CLEAN_HTML: Fin de clean_html_content")
     return str(soup)
 def reinsert_images(html_content: str, images_data: Dict[str, Dict[str, str]]) -> str:
     soup = BeautifulSoup(html_content, 'html.parser')
             if image_key in images_data:
                 img_tag = soup.new_tag('img')
                 img_tag['src'] = f"data:image/jpeg;base64,{images_data[image_key]['base64_image']}"
+                img_tag['alt'] = images_data[image_key].get('description', 'Description indisponible')
                 new_content = soup.new_tag('div', attrs={'class': 'image-block'})
                 new_content.append(img_tag)
                 description = images_data[image_key]['description'].replace('\n', ' ').strip()
                 description = re.sub(r'\s+', ' ', description)
                 p_tag.string = f"Image {image_number} : {description}"
                 new_content.append(p_tag)
                 # Ajout d'un journal pour vérifier l'insertion
+                logging.debug(f"Inserting description for {image_key}: {p_tag.get_text(strip=True)}")
                 comment.replace_with(new_content)
             else: