Bentham committed on
Commit
d26c63c
·
verified ·
1 Parent(s): df2ec6f

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +14 -15
main.py CHANGED
@@ -7,7 +7,6 @@ import shutil
7
  import logging
8
  import tempfile
9
  from pdfminer.high_level import extract_text
10
- import csv
11
 
12
  # Initialize the logger
13
  logging.basicConfig(level=logging.DEBUG)
@@ -28,52 +27,52 @@ async def convert_file_to_txt(
28
  background_tasks: BackgroundTasks = BackgroundTasks()
29
  ):
30
  try:
 
31
  original_filename = file.filename
32
  base_filename, ext = os.path.splitext(original_filename)
33
  ext = ext.lower()
34
 
 
35
  allowed_extensions = [
36
- '.odt', '.pdf', '.doc', '.docx', '.html', '.htm', '.md', '.txt', '.rtf',
37
- '.epub', '.csv', '.ppt', '.pptx', '.xls', '.xlsx'
38
  ]
 
39
  if ext not in allowed_extensions:
40
  raise HTTPException(status_code=400, detail=f"Unsupported file extension: {ext}")
41
 
 
42
  with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as input_tmp_file:
43
  input_filename = input_tmp_file.name
44
  with open(input_filename, "wb") as f:
45
  shutil.copyfileobj(file.file, f)
46
  logging.debug(f"Uploaded file: {input_filename}")
47
 
 
48
  unique_id = uuid.uuid4().hex
49
  output_filename = os.path.join(tempfile.gettempdir(), f"{base_filename}_{unique_id}.txt")
50
 
51
- # Conversion CSV
52
- if ext == '.csv':
53
- with open(input_filename, newline='') as csvfile, open(output_filename, "w") as txtfile:
54
- reader = csv.reader(csvfile)
55
- for row in reader:
56
- txtfile.write(' '.join(row) + '\n')
57
- logging.debug(f"CSV conversion successful: {output_filename}")
58
-
59
- # Conversion PDF
60
- elif ext == '.pdf':
61
  text = extract_text(input_filename)
62
  with open(output_filename, "w") as f:
63
  f.write(text)
64
  logging.debug(f"PDF conversion successful: {output_filename}")
65
-
66
- # Autres formats pris en charge par Pandoc
67
  else:
68
  output = pypandoc.convert_file(input_filename, 'plain', outputfile=output_filename)
69
  logging.debug(f"Conversion successful: {output_filename}")
70
 
 
71
  if not os.path.exists(output_filename):
72
  logging.error(f"The file {output_filename} was not generated.")
73
  raise HTTPException(status_code=500, detail="Error during conversion.")
74
 
 
75
  background_tasks.add_task(delete_temp_files, [input_filename, output_filename])
76
 
 
77
  return FileResponse(output_filename, filename=f"{base_filename}.txt")
78
 
79
  except HTTPException as http_exc:
 
7
  import logging
8
  import tempfile
9
  from pdfminer.high_level import extract_text
 
10
 
11
  # Initialize the logger
12
  logging.basicConfig(level=logging.DEBUG)
 
27
  background_tasks: BackgroundTasks = BackgroundTasks()
28
  ):
29
  try:
30
+ # Original file name and extension
31
  original_filename = file.filename
32
  base_filename, ext = os.path.splitext(original_filename)
33
  ext = ext.lower()
34
 
35
+ # Allowed extensions for conversion
36
  allowed_extensions = [
37
+ '.odt', '.pdf', '.docx', '.html', '.htm', '.md', '.txt', '.rtf', '.epub',
38
+ '.tex', '.xml', '.org', '.commonmark', '.cm', '.wiki', '.opml'
39
  ]
40
+
41
  if ext not in allowed_extensions:
42
  raise HTTPException(status_code=400, detail=f"Unsupported file extension: {ext}")
43
 
44
+ # Create a temporary input file with the correct extension
45
  with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as input_tmp_file:
46
  input_filename = input_tmp_file.name
47
  with open(input_filename, "wb") as f:
48
  shutil.copyfileobj(file.file, f)
49
  logging.debug(f"Uploaded file: {input_filename}")
50
 
51
+ # Define the output file name, keeping the same base name but with .txt extension
52
  unique_id = uuid.uuid4().hex
53
  output_filename = os.path.join(tempfile.gettempdir(), f"{base_filename}_{unique_id}.txt")
54
 
55
+ # PDF to text conversion
56
+ if ext == '.pdf':
 
 
 
 
 
 
 
 
57
  text = extract_text(input_filename)
58
  with open(output_filename, "w") as f:
59
  f.write(text)
60
  logging.debug(f"PDF conversion successful: {output_filename}")
61
+
62
+ # Other file formats to text conversion using Pandoc
63
  else:
64
  output = pypandoc.convert_file(input_filename, 'plain', outputfile=output_filename)
65
  logging.debug(f"Conversion successful: {output_filename}")
66
 
67
+ # Check if the .txt file exists
68
  if not os.path.exists(output_filename):
69
  logging.error(f"The file {output_filename} was not generated.")
70
  raise HTTPException(status_code=500, detail="Error during conversion.")
71
 
72
+ # Add temporary files to background task for deletion after sending the response
73
  background_tasks.add_task(delete_temp_files, [input_filename, output_filename])
74
 
75
+ # Return the converted file to the client, with the same base name and .txt extension
76
  return FileResponse(output_filename, filename=f"{base_filename}.txt")
77
 
78
  except HTTPException as http_exc: