Spaces:

Bentham
/

convertToTXT

Sleeping

Bentham committed on Nov 4, 2024

Commit

2bbdd94

verified ·

1 Parent(s): d26c63c

Update main.py

Files changed (1) hide show

main.py CHANGED Viewed

@@ -6,7 +6,7 @@ import pypandoc
 import shutil
 import logging
 import tempfile
-from pdfminer.high_level import extract_text
 # Initialize the logger
 logging.basicConfig(level=logging.DEBUG)
@@ -52,12 +52,15 @@ async def convert_file_to_txt(
         unique_id = uuid.uuid4().hex
         output_filename = os.path.join(tempfile.gettempdir(), f"{base_filename}_{unique_id}.txt")
-        # PDF to text conversion
         if ext == '.pdf':
-            text = extract_text(input_filename)
-            with open(output_filename, "w") as f:
                 f.write(text)
-            logging.debug(f"PDF conversion successful: {output_filename}")
         # Other file formats to text conversion using Pandoc
         else:

 import shutil
 import logging
 import tempfile
+import fitz  # PyMuPDF importé ici
 # Initialize the logger
 logging.basicConfig(level=logging.DEBUG)
         unique_id = uuid.uuid4().hex
         output_filename = os.path.join(tempfile.gettempdir(), f"{base_filename}_{unique_id}.txt")
+        # PDF to text conversion using PyMuPDF
         if ext == '.pdf':
+            text = ""
+            with fitz.open(input_filename) as doc:
+                for page in doc:
+                    text += page.get_text()
+            with open(output_filename, "w", encoding="utf-8") as f:
                 f.write(text)
+            logging.debug(f"PDF conversion successful with PyMuPDF: {output_filename}")
         # Other file formats to text conversion using Pandoc
         else: