Bentham committed on
Commit
af461e5
·
verified ·
1 Parent(s): 76171bc

bug convert to txt

Browse files
Files changed (1) hide show
  1. main.py +19 -9
main.py CHANGED
@@ -805,8 +805,10 @@ async def convert_file_to_txt(
805
  output_filename = os.path.join(tempfile.gettempdir(), f"{base_filename}_{unique_id}.txt")
806
 
807
  text = ""
808
- # MODIFICATIONS START: Extraction du texte et des images
809
- images_data = {}
 
 
810
  if ext == '.pdf':
811
  text, images = extract_text_with_image_markers(input_filename)
812
  elif ext == '.pptx':
@@ -838,10 +840,8 @@ async def convert_file_to_txt(
838
  text = f.read()
839
  images = []
840
 
841
- # Analyse des images
842
- # On récupère les descriptions des images
843
  if images:
844
- image_descriptions = {}
845
  tasks = []
846
  for img_num, img_bytes in images:
847
  base64_image = base64.b64encode(img_bytes).decode('utf-8')
@@ -852,10 +852,13 @@ async def convert_file_to_txt(
852
  )
853
  ))
854
 
855
- descriptions = await asyncio.gather(*tasks)
856
 
857
  for (img_num, _), desc in zip(images, descriptions):
858
- if desc and desc != "Description indisponible.":
 
 
 
859
  image_descriptions[img_num] = desc
860
  else:
861
  image_descriptions[img_num] = "Description indisponible."
@@ -865,8 +868,8 @@ async def convert_file_to_txt(
865
  marker = f"[IMG_{img_num}]"
866
  description_text = f"Image {img_num}: {desc}"
867
  text = text.replace(marker, description_text)
868
- # MODIFICATIONS END
869
 
 
870
  with open(output_filename, "w", encoding="utf-8") as f:
871
  f.write(text)
872
 
@@ -879,4 +882,11 @@ async def convert_file_to_txt(
879
  temp_files_to_delete.append(cleaned_input_filename)
880
  background_tasks.add_task(delete_temp_files, temp_files_to_delete)
881
 
882
- return FileResponse(output_filename, filename=f"{base_filename}.txt")
 
 
 
 
 
 
 
 
805
  output_filename = os.path.join(tempfile.gettempdir(), f"{base_filename}_{unique_id}.txt")
806
 
807
  text = ""
808
+ images = []
809
+ image_descriptions = {}
810
+
811
+ # Extraction du texte et des images
812
  if ext == '.pdf':
813
  text, images = extract_text_with_image_markers(input_filename)
814
  elif ext == '.pptx':
 
840
  text = f.read()
841
  images = []
842
 
843
+ # Analyse des images et récupération des descriptions
 
844
  if images:
 
845
  tasks = []
846
  for img_num, img_bytes in images:
847
  base64_image = base64.b64encode(img_bytes).decode('utf-8')
 
852
  )
853
  ))
854
 
855
+ descriptions = await asyncio.gather(*tasks, return_exceptions=True)
856
 
857
  for (img_num, _), desc in zip(images, descriptions):
858
+ if isinstance(desc, Exception):
859
+ logging.error(f"Erreur lors de la description de l'image {img_num} : {str(desc)}")
860
+ image_descriptions[img_num] = "Description indisponible."
861
+ elif desc and desc != "Description indisponible.":
862
  image_descriptions[img_num] = desc
863
  else:
864
  image_descriptions[img_num] = "Description indisponible."
 
868
  marker = f"[IMG_{img_num}]"
869
  description_text = f"Image {img_num}: {desc}"
870
  text = text.replace(marker, description_text)
 
871
 
872
+ # Écriture du texte dans le fichier de sortie
873
  with open(output_filename, "w", encoding="utf-8") as f:
874
  f.write(text)
875
 
 
882
  temp_files_to_delete.append(cleaned_input_filename)
883
  background_tasks.add_task(delete_temp_files, temp_files_to_delete)
884
 
885
+ return FileResponse(output_filename, filename=f"{base_filename}.txt")
886
+
887
+ except HTTPException as http_exc:
888
+ logging.error(f"Erreur HTTP lors de la conversion : {str(http_exc.detail)}")
889
+ return JSONResponse(status_code=http_exc.status_code, content={"message": http_exc.detail})
890
+ except Exception as e:
891
+ logging.error(f"Erreur interne lors de la conversion : {str(e)}")
892
+ return JSONResponse(status_code=500, content={"message": f"Erreur interne : {str(e)}"})