convert_test

Sleeping

App Files Files Community

Bentham committed on Dec 20, 2024

Commit

881bf2e

verified ·

1 Parent(s): f3ad28d

Update main.py

Browse files

Files changed (1) hide show

main.py +95 -108

main.py CHANGED Viewed

@@ -355,15 +355,15 @@ def convert_ppt_to_text(input_filename: str) -> str:
                 if image_extension in ['png', 'jpeg', 'jpg', 'gif']:
                     mime_type = f'image/{image_extension}'
                 else:
-                    mime_type = 'image/png'  # Default if extension is unknown
                 data_uri = f'data:{mime_type};base64,{image_base64}'
-                # Add an img tag with the data URI; consider adding alt text if available
-                alt_text = shape.name  # Or some other logic to determine alt text
                 img_tag = f'<img src="{data_uri}" alt="{alt_text}">'
                 text_content.append(img_tag)
-                text_content.append(shape.text)
     return "\n".join(text_content)
 def convert_doc_to_text(input_filename: str) -> str:
     if 'textract' not in globals():
         raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")
@@ -592,110 +592,97 @@ async def convert_file_to_html(
         logging.error(f"Erreur lors du démarrage du job : {str(e)}")
         return JSONResponse(status_code=500, content={"message": f"Erreur lors du démarrage du job : {str(e)}"})
-async def process_file_accessibility(job_id: str, input_file_path: str, ext: str, original_filename: str):
-    job_dir = os.path.join(JOBS_DIR, job_id)
-    try:
-        update_job_status(job_id, 'processing', 'Le fichier est en cours de traitement')
-        base_filename = os.path.splitext(original_filename)[0]
-        # Conversion en HTML sans extraire les images à ce stade
-        if ext == '.pdf':
-            html_content = pdf_to_html(input_file_path)
-        elif ext in ['.ppt', '.pptx']:
-            text = convert_ppt_to_text(input_file_path)
-            html_content = text_to_html(text)
-            html_content = insert_page_comments_every_15_paragraphs(html_content)
-        elif ext == '.doc':
-            text = convert_doc_to_text(input_file_path)
-            html_content = text_to_html(text)
-            html_content = insert_page_comments_every_15_paragraphs(html_content)
-            logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
-    elif ext in ['.html', '.htm']:
-        with open(input_file_path, 'r', encoding='utf-8') as f:
-            raw_html = f.read()
-        elif ext in ['.html', '.htm']:
-            with open(input_file_path, 'r', encoding='utf-8') as f:
-                raw_html = f.read()
-            try:
-                doc = Document(raw_html)
-                cleaned = doc.summary()
-                html_content = cleaned
-            except Exception as e:
-                logging.error(f"Erreur lors du nettoyage HTML : {str(e)}")
-                html_content = raw_html
-            html_content = insert_page_comments_every_15_paragraphs(html_content)
-        else:
-            input_format = get_pandoc_format(ext)
-            html_content = convert_with_pandoc(input_file_path, input_format)
-            html_content = insert_page_comments_every_15_paragraphs(html_content)
-        if ext == '.docx':
-            logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
-        # Nettoyage et extraction des images
-        image_counter = [1]
-        images_data = {}
-        logging.debug(f"DEBUG ACCESSIBILITY: HTML avant clean_html_content : {html_content[:500]}...")
-        cleaned_html = await clean_html_content(html_content, image_counter, images_data)
-        logging.debug(f"DEBUG ACCESSIBILITY: HTML après clean_html_content : {cleaned_html}...")
-        logging.debug(f"DEBUG ACCESSIBILITY: images_data après clean_html_content : {images_data}")
-        if ext == '.docx':
-            logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après clean_html_content (complet) : {cleaned_html}") # Afficher le HTML complet
-        image_counter = [1]
-        images_data = {}
-        # Décrire les images
-        for image_key in images_data:
-            context = get_context_for_image(cleaned_html, image_key)
-            prompt = (
-                "Décris ce que l'on peut voir sur cette image, pour qu'un lecteur malvoyant puisse comprendre ce qu'elle représente."
-                "\nJe vais maintenant te donner les dernières phrases qui précèdent cette image. "
-                "Prends en compte ce contexte pour l'interpréter :\n\"" + context + "\""
-            )
-            images_data[image_key]['prompt'] = prompt
-        tasks = []
-        for image_key in images_data:
-            base64_image = images_data[image_key]['base64_image']
-            prompt = images_data[image_key]['prompt']
-            tasks.append((image_key, asyncio.create_task(get_image_description(base64_image, prompt))))
-        results = await asyncio.gather(*(t[1] for t in tasks))
-        for (image_key, _), description in zip(tasks, results):
-            images_data[image_key]['description'] = description
-        # Réécriture accessible
-        rewritten_html = await rewrite_html_accessible(cleaned_html)
-        # Réinsertion des images
-        final_html = reinsert_images(rewritten_html, images_data)
-        # Nettoyage final
-        final_soup = BeautifulSoup(final_html, 'html.parser')
-        scripts_to_remove = final_soup.find_all('script', src=True)
-        for script in scripts_to_remove:
-            if script['src'].startswith('https://bentham-converttohtml.hf.space/'):
-                script.decompose()
-        final_html = str(final_soup)
-        # Supprimer lignes contenant ```
-        final_html = re.sub(r'^\s*```(?:html)?\s*$', '', final_html, flags=re.MULTILINE)
-        # Insérer le CSS
-        final_html = insert_css_into_html(final_html)
-        output_filename = os.path.join(job_dir, f"{base_filename}.html")
-        with open(output_filename, 'w', encoding='utf-8') as f:
-            f.write(final_html)
-        update_job_status(job_id, 'completed', 'Traitement terminé', result_file=f"{base_filename}.html")
-        delete_files_after_delay([input_file_path], delay=6000)
-    except Exception as e:
-        logging.error(f"Erreur lors du traitement du job {job_id}: {str(e)}")
-        update_job_status(job_id, 'error', f"Erreur: {str(e)}")
 @app.get("/status/{job_id}")

                 if image_extension in ['png', 'jpeg', 'jpg', 'gif']:
                     mime_type = f'image/{image_extension}'
                 else:
+                    mime_type = 'image/png'  # Par défaut si l'extension est inconnue
                 data_uri = f'data:{mime_type};base64,{image_base64}'
+                # Ajouter une balise img avec le data URI ; inclure un alt si disponible
+                alt_text = shape.name  # Ou une logique différente pour déterminer l'alt text
                 img_tag = f'<img src="{data_uri}" alt="{alt_text}">'
                 text_content.append(img_tag)
     return "\n".join(text_content)
 def convert_doc_to_text(input_filename: str) -> str:
     if 'textract' not in globals():
         raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")
         logging.error(f"Erreur lors du démarrage du job : {str(e)}")
         return JSONResponse(status_code=500, content={"message": f"Erreur lors du démarrage du job : {str(e)}"})
+        async def process_file_accessibility(job_id: str, input_file_path: str, ext: str, original_filename: str):
+        job_dir = os.path.join(JOBS_DIR, job_id)
+        try:
+            update_job_status(job_id, 'processing', 'Le fichier est en cours de traitement')
+            base_filename = os.path.splitext(original_filename)[0]
+            # Conversion en HTML sans extraire les images à ce stade
+            if ext == '.pdf':
+                html_content = pdf_to_html(input_file_path)
+            elif ext in ['.ppt', '.pptx']:
+                text = convert_ppt_to_text(input_file_path)
+                html_content = text_to_html(text)
+            elif ext == '.doc':
+                text = convert_doc_to_text(input_file_path)
+                html_content = text_to_html(text)
+            elif ext in ['.html', '.htm']:
+                with open(input_file_path, 'r', encoding='utf-8') as f:
+                    raw_html = f.read()
+                try:
+                    doc = Document(raw_html)
+                    cleaned = doc.summary()
+                    html_content = cleaned
+                except Exception as e:
+                    logging.error(f"Erreur lors du nettoyage HTML : {str(e)}")
+                    html_content = raw_html
+            else:
+                input_format = get_pandoc_format(ext)
+                html_content = convert_with_pandoc(input_file_path, input_format)
+            if ext == '.docx':
+                logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
+            # Nettoyage et extraction des images
+            image_counter = [1]
+            images_data = {}
+            logging.debug(f"DEBUG ACCESSIBILITY: HTML avant clean_html_content : {html_content[:500]}...")
+            cleaned_html = await clean_html_content(html_content, image_counter, images_data)
+            logging.debug(f"DEBUG ACCESSIBILITY: HTML après clean_html_content : {cleaned_html}...")
+            logging.debug(f"DEBUG ACCESSIBILITY: images_data après clean_html_content : {images_data}")
+            # Décrire les images
+            for image_key in images_data:
+                context = get_context_for_image(cleaned_html, image_key)
+                prompt = (
+                    "Décris ce que l'on peut voir sur cette image, pour qu'un lecteur malvoyant puisse comprendre ce qu'elle représente."
+                    "\nJe vais maintenant te donner les dernières phrases qui précèdent cette image. "
+                    "Prends en compte ce contexte pour l'interpréter :\n\"" + context + "\""
+                )
+                images_data[image_key]['prompt'] = prompt
+            tasks = []
+            for image_key in images_data:
+                base64_image = images_data[image_key]['base64_image']
+                prompt = images_data[image_key]['prompt']
+                tasks.append((image_key, asyncio.create_task(get_image_description(base64_image, prompt))))
+            results = await asyncio.gather(*(t[1] for t in tasks))
+            for (image_key, _), description in zip(tasks, results):
+                images_data[image_key]['description'] = description
+            # Réécriture accessible
+            rewritten_html = await rewrite_html_accessible(cleaned_html)
+            # Réinsertion des images
+            final_html = reinsert_images(rewritten_html, images_data)
+            # Nettoyage final
+            final_soup = BeautifulSoup(final_html, 'html.parser')
+            scripts_to_remove = final_soup.find_all('script', src=True)
+            for script in scripts_to_remove:
+                if script['src'].startswith('https://bentham-converttohtml.hf.space/'):
+                    script.decompose()
+            final_html = str(final_soup)
+            # Supprimer lignes contenant ```
+            final_html = re.sub(r'^\s*```(?:html)?\s*$', '', final_html, flags=re.MULTILINE)
+            # Insérer le CSS
+            final_html = insert_css_into_html(final_html)
+            output_filename = os.path.join(job_dir, f"{base_filename}.html")
+            with open(output_filename, 'w', encoding='utf-8') as f:
+                f.write(final_html)
+            update_job_status(job_id, 'completed', 'Traitement terminé', result_file=f"{base_filename}.html")
+            delete_files_after_delay([input_file_path], delay=6000)
+        except Exception as e:
+            logging.error(f"Erreur lors du traitement du job {job_id}: {str(e)}")
+            update_job_status(job_id, 'error', f"Erreur: {str(e)}")
 @app.get("/status/{job_id}")