Bentham committed on
Commit
6a78e7d
·
verified ·
1 Parent(s): 33c217c

re-debug google

Browse files
Files changed (1) hide show
  1. main.py +19 -2
main.py CHANGED
@@ -372,6 +372,7 @@ def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
372
 
373
  async def clean_html_content(html_content: str, image_counter: List[int], images_data: Dict[str, Dict[str, str]]) -> str:
374
  soup = BeautifulSoup(html_content, 'html.parser')
 
375
 
376
  for tag in soup.find_all():
377
  if 'style' in tag.attrs:
@@ -384,6 +385,8 @@ async def clean_html_content(html_content: str, image_counter: List[int], images
384
  span.unwrap()
385
 
386
  img_tags = soup.find_all('img')
 
 
387
  if img_tags:
388
  if len(img_tags) > 20:
389
  logging.warning(f"Number of images ({len(img_tags)}) exceeds 20. Images will be ignored.")
@@ -392,22 +395,27 @@ async def clean_html_content(html_content: str, image_counter: List[int], images
392
  else:
393
  for img in img_tags:
394
  src = img.get('src', '')
 
395
  X = image_counter[0]
396
  if src.startswith('data:image/'):
 
397
  base64_image = encode_image_from_data_uri(src)
398
  if base64_image:
399
  images_data[f"IMG_{X}"] = {
400
  'base64_image': base64_image
401
  }
402
  placeholder = f"<!--IMG_{X}-->"
 
403
  img.replace_with(BeautifulSoup(placeholder, 'html.parser'))
404
  image_counter[0] += 1
405
  else:
 
406
  img.decompose()
407
  else:
 
408
  img.decompose()
409
  else:
410
- logging.debug("No <img> tags found in the HTML content.")
411
 
412
  for img in soup.find_all('img'):
413
  img.decompose()
@@ -422,6 +430,7 @@ async def clean_html_content(html_content: str, image_counter: List[int], images
422
  if not tag.get_text(strip=True):
423
  tag.decompose()
424
 
 
425
  return str(soup)
426
 
427
  def reinsert_images(html_content: str, images_data: Dict[str, Dict[str, str]]) -> str:
@@ -611,6 +620,9 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
611
  html_content = convert_with_pandoc(input_file_path, input_format)
612
  html_content = insert_page_comments_every_15_paragraphs(html_content)
613
 
 
 
 
614
  # Nettoyage et extraction des images
615
  image_counter = [1]
616
  images_data = {}
@@ -619,6 +631,8 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
619
  logging.debug(f"DEBUG ACCESSIBILITY: HTML après clean_html_content : {cleaned_html[:500]}...")
620
  logging.debug(f"DEBUG ACCESSIBILITY: images_data après clean_html_content : {images_data}")
621
 
 
 
622
 
623
  # Décrire les images
624
  for image_key in images_data:
@@ -756,7 +770,7 @@ async def convert_file_to_txt(
756
 
757
  if ext == '.docx':
758
  logging.debug(f"DEBUG CONVERT_TO_TXT (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
759
-
760
  # Nettoyage et extraction des images
761
  image_counter = [1]
762
  images_data = {}
@@ -765,6 +779,9 @@ async def convert_file_to_txt(
765
  logging.debug(f"DEBUG CONVERT_TO_TXT: HTML après clean_html_content : {cleaned_html[:500]}...")
766
  logging.debug(f"DEBUG CONVERT_TO_TXT: images_data après clean_html_content : {images_data}")
767
 
 
 
 
768
  # Vérification des commentaires IMG_X dans le HTML nettoyé
769
  cleaned_soup = BeautifulSoup(cleaned_html, 'html.parser')
770
  has_img_comments = any(re.match(r'IMG_\d+', str(comment)) for comment in cleaned_soup.find_all(string=lambda text: isinstance(text, Comment)))
 
372
 
373
  async def clean_html_content(html_content: str, image_counter: List[int], images_data: Dict[str, Dict[str, str]]) -> str:
374
  soup = BeautifulSoup(html_content, 'html.parser')
375
+ logging.debug(f"DEBUG CLEAN_HTML: Début de clean_html_content")
376
 
377
  for tag in soup.find_all():
378
  if 'style' in tag.attrs:
 
385
  span.unwrap()
386
 
387
  img_tags = soup.find_all('img')
388
+ logging.debug(f"DEBUG CLEAN_HTML: Nombre de balises <img> trouvées : {len(img_tags)}")
389
+
390
  if img_tags:
391
  if len(img_tags) > 20:
392
  logging.warning(f"Number of images ({len(img_tags)}) exceeds 20. Images will be ignored.")
 
395
  else:
396
  for img in img_tags:
397
  src = img.get('src', '')
398
+ logging.debug(f"DEBUG CLEAN_HTML: Traitement de la balise <img> avec src='{src[:100]}...'") # Afficher le début du src
399
  X = image_counter[0]
400
  if src.startswith('data:image/'):
401
+ logging.debug(f"DEBUG CLEAN_HTML: src commence par data:image/")
402
  base64_image = encode_image_from_data_uri(src)
403
  if base64_image:
404
  images_data[f"IMG_{X}"] = {
405
  'base64_image': base64_image
406
  }
407
  placeholder = f"<!--IMG_{X}-->"
408
+ logging.debug(f"DEBUG CLEAN_HTML: Remplacement par le commentaire : {placeholder}")
409
  img.replace_with(BeautifulSoup(placeholder, 'html.parser'))
410
  image_counter[0] += 1
411
  else:
412
+ logging.debug(f"DEBUG CLEAN_HTML: Erreur lors de l'encodage base64, suppression de l'image.")
413
  img.decompose()
414
  else:
415
+ logging.debug(f"DEBUG CLEAN_HTML: src ne commence PAS par data:image/, suppression de l'image.")
416
  img.decompose()
417
  else:
418
+ logging.debug("DEBUG CLEAN_HTML: Aucune balise <img> trouvée dans le contenu HTML.")
419
 
420
  for img in soup.find_all('img'):
421
  img.decompose()
 
430
  if not tag.get_text(strip=True):
431
  tag.decompose()
432
 
433
+ logging.debug(f"DEBUG CLEAN_HTML: Fin de clean_html_content")
434
  return str(soup)
435
 
436
  def reinsert_images(html_content: str, images_data: Dict[str, Dict[str, str]]) -> str:
 
620
  html_content = convert_with_pandoc(input_file_path, input_format)
621
  html_content = insert_page_comments_every_15_paragraphs(html_content)
622
 
623
+ if ext == '.docx':
624
+ logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
625
+
626
  # Nettoyage et extraction des images
627
  image_counter = [1]
628
  images_data = {}
 
631
  logging.debug(f"DEBUG ACCESSIBILITY: HTML après clean_html_content : {cleaned_html[:500]}...")
632
  logging.debug(f"DEBUG ACCESSIBILITY: images_data après clean_html_content : {images_data}")
633
 
634
+ if ext == '.docx':
635
+ logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après clean_html_content (complet) : {cleaned_html}") # Afficher le HTML complet
636
 
637
  # Décrire les images
638
  for image_key in images_data:
 
770
 
771
  if ext == '.docx':
772
  logging.debug(f"DEBUG CONVERT_TO_TXT (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
773
+
774
  # Nettoyage et extraction des images
775
  image_counter = [1]
776
  images_data = {}
 
779
  logging.debug(f"DEBUG CONVERT_TO_TXT: HTML après clean_html_content : {cleaned_html[:500]}...")
780
  logging.debug(f"DEBUG CONVERT_TO_TXT: images_data après clean_html_content : {images_data}")
781
 
782
+ if ext == '.docx':
783
+ logging.debug(f"DEBUG CONVERT_TO_TXT (.docx): HTML après clean_html_content (complet) : {cleaned_html}") # Afficher le HTML complet
784
+
785
  # Vérification des commentaires IMG_X dans le HTML nettoyé
786
  cleaned_soup = BeautifulSoup(cleaned_html, 'html.parser')
787
  has_img_comments = any(re.match(r'IMG_\d+', str(comment)) for comment in cleaned_soup.find_all(string=lambda text: isinstance(text, Comment)))