Bentham committed on
Commit
72873ed
·
verified ·
1 Parent(s): 84fbe33

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +10 -5
main.py CHANGED
@@ -370,6 +370,8 @@ def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
370
  logging.error(f"Erreur lors du nettoyage du fichier HTML {input_filepath} : {str(e)}")
371
  return False
372
 
 
 
373
  async def clean_html_content(html_content: str, image_counter: List[int], images_data: Dict[str, Dict[str, str]]) -> str:
374
  soup = BeautifulSoup(html_content, 'html.parser')
375
  logging.debug(f"DEBUG CLEAN_HTML: Début de clean_html_content")
@@ -404,11 +406,11 @@ async def clean_html_content(html_content: str, image_counter: List[int], images
404
  images_data[f"IMG_{X}"] = {
405
  'base64_image': base64_image
406
  }
407
- placeholder = f"<!--IMG_{X}-->"
408
  comment_tag = Comment(f"IMG_{X}")
409
  img.insert_before(comment_tag)
410
  logging.debug(f"DEBUG CLEAN_HTML: Insertion du commentaire avant l'image : {comment_tag}")
411
- img.decompose() # Remove the image tag
412
  logging.debug(f"DEBUG CLEAN_HTML: Suppression de la balise img.")
413
  image_counter[0] += 1
414
  else:
@@ -420,6 +422,9 @@ async def clean_html_content(html_content: str, image_counter: List[int], images
420
  else:
421
  logging.debug("DEBUG CLEAN_HTML: Aucune balise <img> trouvée dans le contenu HTML.")
422
 
 
 
 
423
  for img in soup.find_all('img'):
424
  img.decompose()
425
 
@@ -436,6 +441,7 @@ async def clean_html_content(html_content: str, image_counter: List[int], images
436
  logging.debug(f"DEBUG CLEAN_HTML: Fin de clean_html_content")
437
  return str(soup)
438
 
 
439
  def reinsert_images(html_content: str, images_data: Dict[str, Dict[str, str]]) -> str:
440
  soup = BeautifulSoup(html_content, 'html.parser')
441
 
@@ -447,7 +453,7 @@ def reinsert_images(html_content: str, images_data: Dict[str, Dict[str, str]]) -
447
  if image_key in images_data:
448
  img_tag = soup.new_tag('img')
449
  img_tag['src'] = f"data:image/jpeg;base64,{images_data[image_key]['base64_image']}"
450
- img_tag['alt'] = images_data[image_key]['description']
451
 
452
  new_content = soup.new_tag('div', attrs={'class': 'image-block'})
453
  new_content.append(img_tag)
@@ -456,11 +462,10 @@ def reinsert_images(html_content: str, images_data: Dict[str, Dict[str, str]]) -
456
  description = images_data[image_key]['description'].replace('\n', ' ').strip()
457
  description = re.sub(r'\s+', ' ', description)
458
  p_tag.string = f"Image {image_number} : {description}"
459
- p_tag.append("\n")
460
  new_content.append(p_tag)
461
 
462
  # Ajout d'un journal pour vérifier l'insertion
463
- logging.debug(f"Inserting description for {image_key}: {p_tag.string}")
464
 
465
  comment.replace_with(new_content)
466
  else:
 
370
  logging.error(f"Erreur lors du nettoyage du fichier HTML {input_filepath} : {str(e)}")
371
  return False
372
 
373
+ from bs4 import Comment
374
+
375
  async def clean_html_content(html_content: str, image_counter: List[int], images_data: Dict[str, Dict[str, str]]) -> str:
376
  soup = BeautifulSoup(html_content, 'html.parser')
377
  logging.debug(f"DEBUG CLEAN_HTML: Début de clean_html_content")
 
406
  images_data[f"IMG_{X}"] = {
407
  'base64_image': base64_image
408
  }
409
+ # Création d'un véritable nœud de commentaire
410
  comment_tag = Comment(f"IMG_{X}")
411
  img.insert_before(comment_tag)
412
  logging.debug(f"DEBUG CLEAN_HTML: Insertion du commentaire avant l'image : {comment_tag}")
413
+ img.decompose() # Supprimer la balise img
414
  logging.debug(f"DEBUG CLEAN_HTML: Suppression de la balise img.")
415
  image_counter[0] += 1
416
  else:
 
422
  else:
423
  logging.debug("DEBUG CLEAN_HTML: Aucune balise <img> trouvée dans le contenu HTML.")
424
 
425
+ # Vérifiez que les commentaires sont bien insérés
426
+ logging.debug(f"DEBUG CLEAN_HTML: HTML après insertion des commentaires IMG_X : {str(soup)[:500]}...")
427
+
428
  for img in soup.find_all('img'):
429
  img.decompose()
430
 
 
441
  logging.debug(f"DEBUG CLEAN_HTML: Fin de clean_html_content")
442
  return str(soup)
443
 
444
+
445
  def reinsert_images(html_content: str, images_data: Dict[str, Dict[str, str]]) -> str:
446
  soup = BeautifulSoup(html_content, 'html.parser')
447
 
 
453
  if image_key in images_data:
454
  img_tag = soup.new_tag('img')
455
  img_tag['src'] = f"data:image/jpeg;base64,{images_data[image_key]['base64_image']}"
456
+ img_tag['alt'] = images_data[image_key].get('description', 'Description indisponible')
457
 
458
  new_content = soup.new_tag('div', attrs={'class': 'image-block'})
459
  new_content.append(img_tag)
 
462
  description = images_data[image_key]['description'].replace('\n', ' ').strip()
463
  description = re.sub(r'\s+', ' ', description)
464
  p_tag.string = f"Image {image_number} : {description}"
 
465
  new_content.append(p_tag)
466
 
467
  # Ajout d'un journal pour vérifier l'insertion
468
+ logging.debug(f"Inserting description for {image_key}: {p_tag.get_text(strip=True)}")
469
 
470
  comment.replace_with(new_content)
471
  else: