Spaces:
Sleeping
Sleeping
bug convert to txt
Browse files
main.py
CHANGED
@@ -805,8 +805,10 @@ async def convert_file_to_txt(
|
|
805 |
output_filename = os.path.join(tempfile.gettempdir(), f"{base_filename}_{unique_id}.txt")
|
806 |
|
807 |
text = ""
|
808 |
-
|
809 |
-
|
|
|
|
|
810 |
if ext == '.pdf':
|
811 |
text, images = extract_text_with_image_markers(input_filename)
|
812 |
elif ext == '.pptx':
|
@@ -838,10 +840,8 @@ async def convert_file_to_txt(
|
|
838 |
text = f.read()
|
839 |
images = []
|
840 |
|
841 |
-
# Analyse des images
|
842 |
-
# On récupère les descriptions des images
|
843 |
if images:
|
844 |
-
image_descriptions = {}
|
845 |
tasks = []
|
846 |
for img_num, img_bytes in images:
|
847 |
base64_image = base64.b64encode(img_bytes).decode('utf-8')
|
@@ -852,10 +852,13 @@ async def convert_file_to_txt(
|
|
852 |
)
|
853 |
))
|
854 |
|
855 |
-
descriptions = await asyncio.gather(*tasks)
|
856 |
|
857 |
for (img_num, _), desc in zip(images, descriptions):
|
858 |
-
if desc
|
|
|
|
|
|
|
859 |
image_descriptions[img_num] = desc
|
860 |
else:
|
861 |
image_descriptions[img_num] = "Description indisponible."
|
@@ -865,8 +868,8 @@ async def convert_file_to_txt(
|
|
865 |
marker = f"[IMG_{img_num}]"
|
866 |
description_text = f"Image {img_num}: {desc}"
|
867 |
text = text.replace(marker, description_text)
|
868 |
-
# MODIFICATIONS END
|
869 |
|
|
|
870 |
with open(output_filename, "w", encoding="utf-8") as f:
|
871 |
f.write(text)
|
872 |
|
@@ -879,4 +882,11 @@ async def convert_file_to_txt(
|
|
879 |
temp_files_to_delete.append(cleaned_input_filename)
|
880 |
background_tasks.add_task(delete_temp_files, temp_files_to_delete)
|
881 |
|
882 |
-
return FileResponse(output_filename, filename=f"{base_filename}.txt")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
805 |
output_filename = os.path.join(tempfile.gettempdir(), f"{base_filename}_{unique_id}.txt")
|
806 |
|
807 |
text = ""
|
808 |
+
images = []
|
809 |
+
image_descriptions = {}
|
810 |
+
|
811 |
+
# Extraction du texte et des images
|
812 |
if ext == '.pdf':
|
813 |
text, images = extract_text_with_image_markers(input_filename)
|
814 |
elif ext == '.pptx':
|
|
|
840 |
text = f.read()
|
841 |
images = []
|
842 |
|
843 |
+
# Analyse des images et récupération des descriptions
|
|
|
844 |
if images:
|
|
|
845 |
tasks = []
|
846 |
for img_num, img_bytes in images:
|
847 |
base64_image = base64.b64encode(img_bytes).decode('utf-8')
|
|
|
852 |
)
|
853 |
))
|
854 |
|
855 |
+
descriptions = await asyncio.gather(*tasks, return_exceptions=True)
|
856 |
|
857 |
for (img_num, _), desc in zip(images, descriptions):
|
858 |
+
if isinstance(desc, Exception):
|
859 |
+
logging.error(f"Erreur lors de la description de l'image {img_num} : {str(desc)}")
|
860 |
+
image_descriptions[img_num] = "Description indisponible."
|
861 |
+
elif desc and desc != "Description indisponible.":
|
862 |
image_descriptions[img_num] = desc
|
863 |
else:
|
864 |
image_descriptions[img_num] = "Description indisponible."
|
|
|
868 |
marker = f"[IMG_{img_num}]"
|
869 |
description_text = f"Image {img_num}: {desc}"
|
870 |
text = text.replace(marker, description_text)
|
|
|
871 |
|
872 |
+
# Écriture du texte dans le fichier de sortie
|
873 |
with open(output_filename, "w", encoding="utf-8") as f:
|
874 |
f.write(text)
|
875 |
|
|
|
882 |
temp_files_to_delete.append(cleaned_input_filename)
|
883 |
background_tasks.add_task(delete_temp_files, temp_files_to_delete)
|
884 |
|
885 |
+
return FileResponse(output_filename, filename=f"{base_filename}.txt")
|
886 |
+
|
887 |
+
except HTTPException as http_exc:
|
888 |
+
logging.error(f"Erreur HTTP lors de la conversion : {str(http_exc.detail)}")
|
889 |
+
return JSONResponse(status_code=http_exc.status_code, content={"message": http_exc.detail})
|
890 |
+
except Exception as e:
|
891 |
+
logging.error(f"Erreur interne lors de la conversion : {str(e)}")
|
892 |
+
return JSONResponse(status_code=500, content={"message": f"Erreur interne : {str(e)}"})
|