Update main.py
main.py CHANGED
@@ -370,6 +370,8 @@ def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
         logging.error(f"Erreur lors du nettoyage du fichier HTML {input_filepath} : {str(e)}")
         return False
 
+from bs4 import Comment
+
 async def clean_html_content(html_content: str, image_counter: List[int], images_data: Dict[str, Dict[str, str]]) -> str:
     soup = BeautifulSoup(html_content, 'html.parser')
     logging.debug(f"DEBUG CLEAN_HTML: Début de clean_html_content")
@@ -404,11 +406,11 @@ async def clean_html_content(html_content: str, image_counter: List[int], images
                 images_data[f"IMG_{X}"] = {
                     'base64_image': base64_image
                 }
-
+                # Création d'un véritable nœud de commentaire
                 comment_tag = Comment(f"IMG_{X}")
                 img.insert_before(comment_tag)
                 logging.debug(f"DEBUG CLEAN_HTML: Insertion du commentaire avant l'image : {comment_tag}")
-                img.decompose()
+                img.decompose() # Supprimer la balise img
                 logging.debug(f"DEBUG CLEAN_HTML: Suppression de la balise img.")
                 image_counter[0] += 1
             else:
@@ -420,6 +422,9 @@ async def clean_html_content(html_content: str, image_counter: List[int], images
     else:
         logging.debug("DEBUG CLEAN_HTML: Aucune balise <img> trouvée dans le contenu HTML.")
 
+    # Vérifiez que les commentaires sont bien insérés
+    logging.debug(f"DEBUG CLEAN_HTML: HTML après insertion des commentaires IMG_X : {str(soup)[:500]}...")
+
     for img in soup.find_all('img'):
         img.decompose()
 
@@ -436,6 +441,7 @@ async def clean_html_content(html_content: str, image_counter: List[int], images
     logging.debug(f"DEBUG CLEAN_HTML: Fin de clean_html_content")
     return str(soup)
 
+
 def reinsert_images(html_content: str, images_data: Dict[str, Dict[str, str]]) -> str:
     soup = BeautifulSoup(html_content, 'html.parser')
 
@@ -447,7 +453,7 @@ def reinsert_images(html_content: str, images_data: Dict[str, Dict[str, str]]) -
         if image_key in images_data:
             img_tag = soup.new_tag('img')
             img_tag['src'] = f"data:image/jpeg;base64,{images_data[image_key]['base64_image']}"
-            img_tag['alt'] = images_data[image_key]
+            img_tag['alt'] = images_data[image_key].get('description', 'Description indisponible')
 
             new_content = soup.new_tag('div', attrs={'class': 'image-block'})
             new_content.append(img_tag)
@@ -456,11 +462,10 @@ def reinsert_images(html_content: str, images_data: Dict[str, Dict[str, str]]) -
             description = images_data[image_key]['description'].replace('\n', ' ').strip()
             description = re.sub(r'\s+', ' ', description)
             p_tag.string = f"Image {image_number} : {description}"
-            p_tag.append("\n")
             new_content.append(p_tag)
 
             # Ajout d'un journal pour vérifier l'insertion
-            logging.debug(f"Inserting description for {image_key}: {p_tag.
+            logging.debug(f"Inserting description for {image_key}: {p_tag.get_text(strip=True)}")
 
             comment.replace_with(new_content)
         else:
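
Below is a minimal, self-contained sketch (not part of this commit) of the placeholder round trip that clean_html_content and reinsert_images implement after these changes: the <img> tag is dropped and a real bs4 Comment node is left in its place, then that comment is later replaced with a rebuilt image block. The sample HTML, the IMG_1 key, and the base64/description stubs are invented for illustration; only documented BeautifulSoup calls (Comment, insert_before, decompose, new_tag, replace_with) are used.

# Sketch only: illustrates the IMG_X comment-placeholder pattern used above.
# The HTML, key, and data-URI values below are made-up example data.
from bs4 import BeautifulSoup, Comment

html = '<p>Avant</p><img src="photo.jpg"><p>Après</p>'
soup = BeautifulSoup(html, 'html.parser')

# Cleaning side: remove the <img> and leave a real comment node as a marker.
img = soup.find('img')
placeholder = Comment("IMG_1")
img.insert_before(placeholder)
img.decompose()
assert "<!--IMG_1-->" in str(soup)

# Reinsertion side: locate the comment again and swap in an image block.
images_data = {"IMG_1": {"base64_image": "...", "description": "Photo d'exemple"}}
for comment in soup.find_all(string=lambda s: isinstance(s, Comment)):
    key = comment.strip()
    if key in images_data:
        block = soup.new_tag('div', attrs={'class': 'image-block'})
        new_img = soup.new_tag('img')
        new_img['src'] = f"data:image/jpeg;base64,{images_data[key]['base64_image']}"
        new_img['alt'] = images_data[key].get('description', 'Description indisponible')
        block.append(new_img)
        comment.replace_with(block)

print(soup)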