accessibility

Sleeping

Bentham committed on Dec 3, 2024

Commit

f8a0705

verified ·

1 Parent(s): cbb8da7

Update main.py

Files changed (1) hide show

main.py CHANGED Viewed

@@ -156,17 +156,13 @@ async def convert_to_accessible_html(input_filename, ext, base_filename, image_c
             with open(input_filename, 'r', encoding='utf-8') as f:
                 html_content = f.read()
             try:
-                # Utiliser readability-lxml pour extraire le contenu principal
                 doc = Document(html_content)
                 main_html = doc.summary()  # Extrait le HTML principal
-                main_text = doc.text()      # Extrait le texte principal
-                # Reconstruire le HTML avec le texte principal
                 html_content = f"<html><body><p>{main_text}</p></body></html>"
                 logging.debug("Contenu HTML nettoyé avec readability-lxml.")
             except Exception as e:
                 logging.error(f"Erreur lors du nettoyage avec readability-lxml : {str(e)}")
-                # Vous pouvez décider de continuer avec le contenu HTML original ou arrêter le traitement
                 return None  # Ou continuez avec html_content non modifié
         # Conversion from PDF to HTML with PyMuPDF
         elif ext == '.pdf':

             with open(input_filename, 'r', encoding='utf-8') as f:
                 html_content = f.read()
             try:
                 doc = Document(html_content)
                 main_html = doc.summary()  # Extrait le HTML principal
+                main_text = doc.content()  # Extrait le contenu texte complet
                 html_content = f"<html><body><p>{main_text}</p></body></html>"
                 logging.debug("Contenu HTML nettoyé avec readability-lxml.")
             except Exception as e:
                 logging.error(f"Erreur lors du nettoyage avec readability-lxml : {str(e)}")
                 return None  # Ou continuez avec html_content non modifié
         # Conversion from PDF to HTML with PyMuPDF
         elif ext == '.pdf':