Spaces:
Sleeping
Sleeping
ppt pypandoc
Browse files
main.py
CHANGED
@@ -785,13 +785,11 @@ async def convert_file_to_txt(
|
|
785 |
elif ext == '.pptx':
|
786 |
html_content = convert_pptx_to_html(input_filename)
|
787 |
elif ext == '.ppt':
|
|
|
788 |
try:
|
789 |
-
|
790 |
-
html_content = text_to_html(text)
|
791 |
-
except HTTPException as e:
|
792 |
-
raise e
|
793 |
except Exception as e:
|
794 |
-
logging.error(f"Erreur lors de la conversion de .ppt avec
|
795 |
raise HTTPException(status_code=500, detail=f"Erreur lors de la conversion du fichier .ppt: {e}")
|
796 |
elif ext == '.doc':
|
797 |
text = convert_doc_to_text(input_filename)
|
@@ -808,18 +806,18 @@ async def convert_file_to_txt(
|
|
808 |
# Nettoyage et extraction des images
|
809 |
image_counter = [1]
|
810 |
images_data = {}
|
811 |
-
logging.debug(f"DEBUG
|
812 |
cleaned_html = await clean_html_content(html_content, image_counter, images_data)
|
813 |
-
logging.debug(f"DEBUG
|
814 |
-
logging.debug(f"DEBUG
|
815 |
|
816 |
if ext == '.docx':
|
817 |
-
logging.debug(f"DEBUG
|
818 |
|
819 |
# Vérification des commentaires IMG_X dans le HTML nettoyé
|
820 |
cleaned_soup = BeautifulSoup(cleaned_html, 'html.parser')
|
821 |
has_img_comments = any(re.match(r'IMG_\d+', str(comment)) for comment in cleaned_soup.find_all(string=lambda text: isinstance(text, Comment)))
|
822 |
-
logging.debug(f"DEBUG
|
823 |
|
824 |
# Description des images pour le mode texte
|
825 |
for image_key in images_data:
|
@@ -847,15 +845,15 @@ async def convert_file_to_txt(
|
|
847 |
images_data[image_key]['description'] = description
|
848 |
|
849 |
# Réinsertion des images avec descriptions
|
850 |
-
logging.debug(f"DEBUG
|
851 |
-
logging.debug(f"DEBUG
|
852 |
final_html = reinsert_images(cleaned_html, images_data)
|
853 |
-
logging.debug(f"DEBUG
|
854 |
|
855 |
# Vérification des descriptions insérées
|
856 |
soup_final = BeautifulSoup(final_html, 'html.parser')
|
857 |
description_paragraphs = soup_final.find_all('p', class_='description')
|
858 |
-
logging.debug(f"DEBUG
|
859 |
for desc in description_paragraphs:
|
860 |
logging.debug(f"Description insérée : {desc.get_text(strip=True)}")
|
861 |
|
|
|
785 |
elif ext == '.pptx':
|
786 |
html_content = convert_pptx_to_html(input_filename)
|
787 |
elif ext == '.ppt':
|
788 |
+
input_format = get_pandoc_format(ext)
|
789 |
try:
|
790 |
+
html_content = convert_with_pandoc(input_filename, input_format)
|
|
|
|
|
|
|
791 |
except Exception as e:
|
792 |
+
logging.error(f"Erreur lors de la conversion de .ppt avec pypandoc: {e}")
|
793 |
raise HTTPException(status_code=500, detail=f"Erreur lors de la conversion du fichier .ppt: {e}")
|
794 |
elif ext == '.doc':
|
795 |
text = convert_doc_to_text(input_filename)
|
|
|
806 |
# Nettoyage et extraction des images
|
807 |
image_counter = [1]
|
808 |
images_data = {}
|
809 |
+
logging.debug(f"DEBUG CONVERT_TO_txt: HTML avant clean_html_content : {html_content[:500]}...")
|
810 |
cleaned_html = await clean_html_content(html_content, image_counter, images_data)
|
811 |
+
logging.debug(f"DEBUG CONVERT_TO_txt: HTML après clean_html_content : {cleaned_html}...")
|
812 |
+
logging.debug(f"DEBUG CONVERT_TO_txt: images_data après clean_html_content : {images_data}")
|
813 |
|
814 |
if ext == '.docx':
|
815 |
+
logging.debug(f"DEBUG CONVERT_TO_txt (.docx): HTML après clean_html_content (complet) : {cleaned_html}") # Afficher le HTML complet
|
816 |
|
817 |
# Vérification des commentaires IMG_X dans le HTML nettoyé
|
818 |
cleaned_soup = BeautifulSoup(cleaned_html, 'html.parser')
|
819 |
has_img_comments = any(re.match(r'IMG_\d+', str(comment)) for comment in cleaned_soup.find_all(string=lambda text: isinstance(text, Comment)))
|
820 |
+
logging.debug(f"DEBUG CONVERT_TO_txt: Présence de commentaires IMG_X après nettoyage : {has_img_comments}")
|
821 |
|
822 |
# Description des images pour le mode texte
|
823 |
for image_key in images_data:
|
|
|
845 |
images_data[image_key]['description'] = description
|
846 |
|
847 |
# Réinsertion des images avec descriptions
|
848 |
+
logging.debug(f"DEBUG CONVERT_TO_txt: HTML avant reinsert_images : {cleaned_html[:500]}...")
|
849 |
+
logging.debug(f"DEBUG CONVERT_TO_txt: images_data avant reinsert_images : {images_data}")
|
850 |
final_html = reinsert_images(cleaned_html, images_data)
|
851 |
+
logging.debug(f"DEBUG CONVERT_TO_txt: HTML après reinsert_images : {final_html[:500]}...")
|
852 |
|
853 |
# Vérification des descriptions insérées
|
854 |
soup_final = BeautifulSoup(final_html, 'html.parser')
|
855 |
description_paragraphs = soup_final.find_all('p', class_='description')
|
856 |
+
logging.debug(f"DEBUG CONVERT_TO_txt: Nombre de descriptions insérées : {len(description_paragraphs)}")
|
857 |
for desc in description_paragraphs:
|
858 |
logging.debug(f"Description insérée : {desc.get_text(strip=True)}")
|
859 |
|