re-debug google
main.py CHANGED
@@ -372,6 +372,7 @@ def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
 
 async def clean_html_content(html_content: str, image_counter: List[int], images_data: Dict[str, Dict[str, str]]) -> str:
     soup = BeautifulSoup(html_content, 'html.parser')
+    logging.debug(f"DEBUG CLEAN_HTML: Début de clean_html_content")
 
     for tag in soup.find_all():
         if 'style' in tag.attrs:
@@ -384,6 +385,8 @@ async def clean_html_content(html_content: str, image_counter: List[int], images
         span.unwrap()
 
     img_tags = soup.find_all('img')
+    logging.debug(f"DEBUG CLEAN_HTML: Nombre de balises <img> trouvées : {len(img_tags)}")
+
     if img_tags:
         if len(img_tags) > 20:
             logging.warning(f"Number of images ({len(img_tags)}) exceeds 20. Images will be ignored.")
@@ -392,22 +395,27 @@ async def clean_html_content(html_content: str, image_counter: List[int], images
         else:
             for img in img_tags:
                 src = img.get('src', '')
+                logging.debug(f"DEBUG CLEAN_HTML: Traitement de la balise <img> avec src='{src[:100]}...'")  # Afficher le début du src
                 X = image_counter[0]
                 if src.startswith('data:image/'):
+                    logging.debug(f"DEBUG CLEAN_HTML: src commence par data:image/")
                     base64_image = encode_image_from_data_uri(src)
                     if base64_image:
                         images_data[f"IMG_{X}"] = {
                             'base64_image': base64_image
                         }
                         placeholder = f"<!--IMG_{X}-->"
+                        logging.debug(f"DEBUG CLEAN_HTML: Remplacement par le commentaire : {placeholder}")
                         img.replace_with(BeautifulSoup(placeholder, 'html.parser'))
                         image_counter[0] += 1
                     else:
+                        logging.debug(f"DEBUG CLEAN_HTML: Erreur lors de l'encodage base64, suppression de l'image.")
                         img.decompose()
                 else:
+                    logging.debug(f"DEBUG CLEAN_HTML: src ne commence PAS par data:image/, suppression de l'image.")
                     img.decompose()
     else:
-        logging.debug("
+        logging.debug("DEBUG CLEAN_HTML: Aucune balise <img> trouvée dans le contenu HTML.")
 
     for img in soup.find_all('img'):
         img.decompose()
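Note: `encode_image_from_data_uri` is referenced above but its body is not part of this diff. Below is a minimal sketch of what such a helper could plausibly look like, assuming standard `data:image/<subtype>;base64,<payload>` URIs; it is inferred from the call site and is not the actual implementation in main.py.

```python
import base64
import logging
from typing import Optional

def encode_image_from_data_uri(src: str) -> Optional[str]:
    """Return the base64 payload of an image data URI, or None on failure.

    Sketch only: inferred from how clean_html_content uses the result;
    the real helper in main.py may behave differently.
    """
    header, sep, payload = src.partition(',')
    if not sep or not header.startswith('data:image/') or ';base64' not in header:
        return None
    try:
        # Validate that the payload really is base64 before handing it back.
        base64.b64decode(payload, validate=True)
    except (ValueError, TypeError) as exc:
        logging.debug(f"Invalid base64 payload in data URI: {exc}")
        return None
    return payload
```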
@@ -422,6 +430,7 @@ async def clean_html_content(html_content: str, image_counter: List[int], images
         if not tag.get_text(strip=True):
             tag.decompose()
 
+    logging.debug(f"DEBUG CLEAN_HTML: Fin de clean_html_content")
     return str(soup)
 
 def reinsert_images(html_content: str, images_data: Dict[str, Dict[str, str]]) -> str:
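For orientation, the round trip these two functions implement: `clean_html_content` swaps each inline image for an `<!--IMG_X-->` comment and stashes its base64 payload in `images_data`, and `reinsert_images` is its counterpart that puts the images back. A hedged usage sketch (the driver code and sample HTML below are illustrative and assume both functions are in scope, as in main.py):

```python
import asyncio

SAMPLE_HTML = '<p>Before</p><img src="data:image/png;base64,iVBORw0KGgo="><p>After</p>'

async def demo() -> None:
    image_counter = [1]   # mutable counter so the numbering survives the call
    images_data = {}      # will receive {"IMG_1": {"base64_image": "..."}}
    cleaned = await clean_html_content(SAMPLE_HTML, image_counter, images_data)
    # cleaned should now contain "<!--IMG_1-->" where the <img> tag used to be
    restored = reinsert_images(cleaned, images_data)
    print(restored)

asyncio.run(demo())
```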
@@ -611,6 +620,9 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
     html_content = convert_with_pandoc(input_file_path, input_format)
     html_content = insert_page_comments_every_15_paragraphs(html_content)
 
+    if ext == '.docx':
+        logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
+
     # Nettoyage et extraction des images
     image_counter = [1]
     images_data = {}
@@ -619,6 +631,8 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
     logging.debug(f"DEBUG ACCESSIBILITY: HTML après clean_html_content : {cleaned_html[:500]}...")
     logging.debug(f"DEBUG ACCESSIBILITY: images_data après clean_html_content : {images_data}")
 
+    if ext == '.docx':
+        logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après clean_html_content (complet) : {cleaned_html}")  # Afficher le HTML complet
 
     # Décrire les images
     for image_key in images_data:
@@ -756,7 +770,7 @@ async def convert_file_to_txt(
 
     if ext == '.docx':
         logging.debug(f"DEBUG CONVERT_TO_TXT (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
-
+
     # Nettoyage et extraction des images
     image_counter = [1]
     images_data = {}
@@ -765,6 +779,9 @@ async def convert_file_to_txt(
     logging.debug(f"DEBUG CONVERT_TO_TXT: HTML après clean_html_content : {cleaned_html[:500]}...")
     logging.debug(f"DEBUG CONVERT_TO_TXT: images_data après clean_html_content : {images_data}")
 
+    if ext == '.docx':
+        logging.debug(f"DEBUG CONVERT_TO_TXT (.docx): HTML après clean_html_content (complet) : {cleaned_html}")  # Afficher le HTML complet
+
     # Vérification des commentaires IMG_X dans le HTML nettoyé
     cleaned_soup = BeautifulSoup(cleaned_html, 'html.parser')
     has_img_comments = any(re.match(r'IMG_\d+', str(comment)) for comment in cleaned_soup.find_all(string=lambda text: isinstance(text, Comment)))
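Note on the `has_img_comments` check: BeautifulSoup exposes HTML comments as `Comment` navigable strings whose `str()` form is the comment text without the `<!-- -->` delimiters, so `re.match(r'IMG_\d+', ...)` anchors at the start of that text. A standalone illustration of the detection pattern (the sample HTML is made up):

```python
import re
from bs4 import BeautifulSoup, Comment

soup = BeautifulSoup('<p>text</p><!--IMG_1--><p>more</p>', 'html.parser')
comments = soup.find_all(string=lambda text: isinstance(text, Comment))
print([str(c) for c in comments])   # ['IMG_1']
print(any(re.match(r'IMG_\d+', str(c)) for c in comments))  # True
```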