Bentham committed on
Commit
2bbdd94
·
verified ·
1 Parent(s): d26c63c

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +8 -5
main.py CHANGED
@@ -6,7 +6,7 @@ import pypandoc
6
  import shutil
7
  import logging
8
  import tempfile
9
- from pdfminer.high_level import extract_text
10
 
11
  # Initialize the logger
12
  logging.basicConfig(level=logging.DEBUG)
@@ -52,12 +52,15 @@ async def convert_file_to_txt(
52
  unique_id = uuid.uuid4().hex
53
  output_filename = os.path.join(tempfile.gettempdir(), f"{base_filename}_{unique_id}.txt")
54
 
55
- # PDF to text conversion
56
  if ext == '.pdf':
57
- text = extract_text(input_filename)
58
- with open(output_filename, "w") as f:
 
 
 
59
  f.write(text)
60
- logging.debug(f"PDF conversion successful: {output_filename}")
61
 
62
  # Other file formats to text conversion using Pandoc
63
  else:
 
6
  import shutil
7
  import logging
8
  import tempfile
9
+ import fitz # PyMuPDF importé ici
10
 
11
  # Initialize the logger
12
  logging.basicConfig(level=logging.DEBUG)
 
52
  unique_id = uuid.uuid4().hex
53
  output_filename = os.path.join(tempfile.gettempdir(), f"{base_filename}_{unique_id}.txt")
54
 
55
+ # PDF to text conversion using PyMuPDF
56
  if ext == '.pdf':
57
+ text = ""
58
+ with fitz.open(input_filename) as doc:
59
+ for page in doc:
60
+ text += page.get_text()
61
+ with open(output_filename, "w", encoding="utf-8") as f:
62
  f.write(text)
63
+ logging.debug(f"PDF conversion successful with PyMuPDF: {output_filename}")
64
 
65
  # Other file formats to text conversion using Pandoc
66
  else: