accessibility

Sleeping

Bentham committed on Dec 3, 2024

Commit

3bcaa2d

verified ·

1 Parent(s): c6372cf

readability

Files changed (1) hide show

main.py CHANGED Viewed

@@ -14,7 +14,7 @@ import json
 import asyncio  # Added for asynchronous functionality
 from openai import AsyncOpenAI  # Import AsyncOpenAI
-from newspaper import Article
 import instructor  # Import instructor for patching
@@ -156,19 +156,16 @@ async def convert_to_accessible_html(input_filename, ext, base_filename, image_c
             with open(input_filename, 'r', encoding='utf-8') as f:
                 html_content = f.read()
             try:
-                # Utiliser newspaper3k pour extraire le texte principal
-                article = Article(url='')
-                article.set_html(html_content)
-                article.parse()
-                # Obtenir le texte principal
-                main_text = article.text
                 # Reconstruire le HTML avec le texte principal
                 html_content = f"<html><body><p>{main_text}</p></body></html>"
-                logging.debug("Contenu HTML nettoyé avec newspaper3k.")
             except Exception as e:
-                logging.error(f"Erreur lors du nettoyage avec newspaper3k : {str(e)}")
                 # Vous pouvez décider de continuer avec le contenu HTML original ou arrêter le traitement
                 return None  # Ou continuez avec html_content non modifié
         # Conversion from PDF to HTML with PyMuPDF

 import asyncio  # Added for asynchronous functionality
 from openai import AsyncOpenAI  # Import AsyncOpenAI
+from readability import Document
 import instructor  # Import instructor for patching
             with open(input_filename, 'r', encoding='utf-8') as f:
                 html_content = f.read()
             try:
+                # Utiliser readability-lxml pour extraire le contenu principal
+                doc = Document(html_content)
+                main_html = doc.summary()  # Extrait le HTML principal
+                main_text = doc.text()      # Extrait le texte principal
                 # Reconstruire le HTML avec le texte principal
                 html_content = f"<html><body><p>{main_text}</p></body></html>"
+                logging.debug("Contenu HTML nettoyé avec readability-lxml.")
             except Exception as e:
+                logging.error(f"Erreur lors du nettoyage avec readability-lxml : {str(e)}")
                 # Vous pouvez décider de continuer avec le contenu HTML original ou arrêter le traitement
                 return None  # Ou continuez avec html_content non modifié
         # Conversion from PDF to HTML with PyMuPDF