accessibility

Sleeping

App Files Community

Bentham committed on Dec 4, 2024

Commit

d26091d

verified ·

1 Parent(s): a1a2e31

nettoyage des html dans transformation en txt

Browse files

Files changed (1) hide show

main.py +75 -24

main.py CHANGED Viewed

@@ -687,6 +687,30 @@ async def clean_html_content(html_content: str, image_counter: List[int], images
     # Return the cleaned HTML as a string
     return str(soup)
 def reinsert_images(html_content: str, images_data: Dict[str, Dict[str, str]]) -> str:
     """Function to re-integrate images and their descriptions into the final HTML code."""
@@ -803,11 +827,15 @@ async def get_result(job_id: str):
 def delete_temp_files(file_paths: list):
-    """Function to delete temporary files after the response"""
     for file_path in file_paths:
-        if os.path.exists(file_path):
-            os.remove(file_path)
-            logging.debug(f"Temporary file deleted: {file_path}")
 @app.post("/convert_to_txt/")
 async def convert_file_to_txt(
@@ -815,32 +843,42 @@ async def convert_file_to_txt(
     background_tasks: BackgroundTasks = BackgroundTasks()
 ):
     try:
-        # Original file name and extension
         original_filename = file.filename
         base_filename, ext = os.path.splitext(original_filename)
         ext = ext.lower()
-        # Allowed extensions for conversion
         allowed_extensions = [
             '.odt', '.pdf', '.docx', '.html', '.htm', '.md', '.txt', '.rtf', '.epub',
             '.tex', '.xml', '.org', '.commonmark', '.cm', '.wiki', '.opml'
         ]
         if ext not in allowed_extensions:
-            raise HTTPException(status_code=400, detail=f"Unsupported file extension: {ext}")
-        # Create a temporary input file with the correct extension
         with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as input_tmp_file:
             input_filename = input_tmp_file.name
             with open(input_filename, "wb") as f:
                 shutil.copyfileobj(file.file, f)
-            logging.debug(f"Uploaded file: {input_filename}")
-        # Define the output file name, keeping the same base name but with .txt extension
         unique_id = uuid.uuid4().hex
         output_filename = os.path.join(tempfile.gettempdir(), f"{base_filename}_{unique_id}.txt")
-        # PDF to text conversion using PyMuPDF
         if ext == '.pdf':
             text = ""
             with fitz.open(input_filename) as doc:
@@ -848,27 +886,40 @@ async def convert_file_to_txt(
                     text += page.get_text()
             with open(output_filename, "w", encoding="utf-8") as f:
                 f.write(text)
-            logging.debug(f"PDF conversion successful with PyMuPDF: {output_filename}")
-        # Other file formats to text conversion using Pandoc
         else:
             output = pypandoc.convert_file(input_filename, 'plain', outputfile=output_filename)
-            logging.debug(f"Conversion successful: {output_filename}")
-        # Check if the .txt file exists
         if not os.path.exists(output_filename):
-            logging.error(f"The file {output_filename} was not generated.")
-            raise HTTPException(status_code=500, detail="Error during conversion.")
-        # Add temporary files to background task for deletion after sending the response
-        background_tasks.add_task(delete_temp_files, [input_filename, output_filename])
-        # Return the converted file to the client, with the same base name and .txt extension
         return FileResponse(output_filename, filename=f"{base_filename}.txt")
     except HTTPException as http_exc:
-        logging.error(f"HTTP error during conversion: {str(http_exc.detail)}")
         return JSONResponse(status_code=http_exc.status_code, content={"message": http_exc.detail})
     except Exception as e:
-        logging.error(f"Error during conversion: {str(e)}")
-        return JSONResponse(status_code=500, content={"message": f"Internal error: {str(e)}"})

     # Return the cleaned HTML as a string
     return str(soup)
+def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
+    """
+    Nettoie le contenu HTML en utilisant readability-lxml et écrit le contenu nettoyé dans un nouveau fichier.
+    Args:
+        input_filepath (str): Chemin vers le fichier HTML original.
+        cleaned_output_filepath (str): Chemin vers le fichier HTML nettoyé.
+    Returns:
+        bool: True si le nettoyage a réussi, False sinon.
+    """
+    try:
+        with open(input_filepath, 'r', encoding='utf-8') as f:
+            html_content = f.read()
+        doc = Document(html_content)
+        cleaned_html = doc.summary()  # Extrait le HTML principal
+        with open(cleaned_output_filepath, 'w', encoding='utf-8') as f:
+            f.write(cleaned_html)
+        logging.debug("Contenu HTML nettoyé avec readability-lxml.")
+        return True
+    except Exception as e:
+        logging.error(f"Erreur lors du nettoyage du fichier HTML {input_filepath} : {str(e)}")
+        return False
 def reinsert_images(html_content: str, images_data: Dict[str, Dict[str, str]]) -> str:
     """Function to re-integrate images and their descriptions into the final HTML code."""
 def delete_temp_files(file_paths: list):
+    """Fonction pour supprimer les fichiers temporaires après la réponse."""
     for file_path in file_paths:
+        try:
+            if os.path.exists(file_path):
+                os.remove(file_path)
+                logging.debug(f"Fichier temporaire supprimé : {file_path}")
+        except Exception as e:
+            logging.error(f"Erreur lors de la suppression du fichier {file_path} : {str(e)}")
 @app.post("/convert_to_txt/")
 async def convert_file_to_txt(
     background_tasks: BackgroundTasks = BackgroundTasks()
 ):
     try:
+        # Nom de fichier original et extension
         original_filename = file.filename
         base_filename, ext = os.path.splitext(original_filename)
         ext = ext.lower()
+        # Extensions autorisées pour la conversion
         allowed_extensions = [
             '.odt', '.pdf', '.docx', '.html', '.htm', '.md', '.txt', '.rtf', '.epub',
             '.tex', '.xml', '.org', '.commonmark', '.cm', '.wiki', '.opml'
         ]
         if ext not in allowed_extensions:
+            raise HTTPException(status_code=400, detail=f"Extension de fichier non supportée : {ext}")
+        # Créer un fichier temporaire d'entrée avec la bonne extension
         with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as input_tmp_file:
             input_filename = input_tmp_file.name
             with open(input_filename, "wb") as f:
                 shutil.copyfileobj(file.file, f)
+            logging.debug(f"Fichier téléchargé enregistré : {input_filename}")
+        # Si le fichier est HTML ou HTM, effectuer le nettoyage
+        if ext in ['.html', '.htm']:
+            cleaned_input_filename = input_filename + '_cleaned.html'
+            nettoyage_reussi = clean_html_file(input_filename, cleaned_input_filename)
+            if not nettoyage_reussi:
+                raise HTTPException(status_code=500, detail="Erreur lors du nettoyage du fichier HTML.")
+            # Utiliser le fichier nettoyé pour la conversion
+            input_filename = cleaned_input_filename
+            logging.debug(f"Fichier HTML nettoyé enregistré : {input_filename}")
+        # Définir le nom du fichier de sortie, en conservant le même nom de base mais avec l'extension .txt
         unique_id = uuid.uuid4().hex
         output_filename = os.path.join(tempfile.gettempdir(), f"{base_filename}_{unique_id}.txt")
+        # Conversion de PDF en texte avec PyMuPDF
         if ext == '.pdf':
             text = ""
             with fitz.open(input_filename) as doc:
                     text += page.get_text()
             with open(output_filename, "w", encoding="utf-8") as f:
                 f.write(text)
+            logging.debug(f"Conversion PDF réussie avec PyMuPDF : {output_filename}")
+        # Conversion des autres formats en texte avec Pandoc
         else:
+            # Pour Markdown, assurez-vous que le contenu est nettoyé si nécessaire
+            if ext in ['.md', '.markdown']:
+                with open(input_filename, 'r', encoding='utf-8') as f:
+                    content = f.read()
+                # Optionnel : appliquer des nettoyages spécifiques au Markdown
+                with open(input_filename, 'w', encoding='utf-8') as f:
+                    f.write(content)  # Ici, aucun nettoyage spécifique
             output = pypandoc.convert_file(input_filename, 'plain', outputfile=output_filename)
+            logging.debug(f"Conversion réussie avec Pandoc : {output_filename}")
+        # Vérifier si le fichier .txt existe
         if not os.path.exists(output_filename):
+            logging.error(f"Le fichier {output_filename} n'a pas été généré.")
+            raise HTTPException(status_code=500, detail="Erreur lors de la conversion.")
+        # Ajouter les fichiers temporaires à la tâche d'arrière-plan pour suppression après l'envoi de la réponse
+        # Inclure le fichier nettoyé s'il existe
+        temp_files_to_delete = [input_filename, output_filename]
+        if ext in ['.html', '.htm']:
+            temp_files_to_delete.append(cleaned_input_filename)
+        background_tasks.add_task(delete_temp_files, temp_files_to_delete)
+        # Retourner le fichier converti au client, avec le même nom de base et l'extension .txt
         return FileResponse(output_filename, filename=f"{base_filename}.txt")
     except HTTPException as http_exc:
+        logging.error(f"Erreur HTTP lors de la conversion : {str(http_exc.detail)}")
         return JSONResponse(status_code=http_exc.status_code, content={"message": http_exc.detail})
     except Exception as e:
+        logging.error(f"Erreur interne lors de la conversion : {str(e)}")
+        return JSONResponse(status_code=500, content={"message": f"Erreur interne : {str(e)}"})