Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
@@ -7,7 +7,6 @@ import shutil
|
|
7 |
import logging
|
8 |
import tempfile
|
9 |
from pdfminer.high_level import extract_text
|
10 |
-
import csv
|
11 |
|
12 |
# Initialize the logger
|
13 |
logging.basicConfig(level=logging.DEBUG)
|
@@ -28,52 +27,52 @@ async def convert_file_to_txt(
|
|
28 |
background_tasks: BackgroundTasks = BackgroundTasks()
|
29 |
):
|
30 |
try:
|
|
|
31 |
original_filename = file.filename
|
32 |
base_filename, ext = os.path.splitext(original_filename)
|
33 |
ext = ext.lower()
|
34 |
|
|
|
35 |
allowed_extensions = [
|
36 |
-
'.odt', '.pdf', '.
|
37 |
-
'.
|
38 |
]
|
|
|
39 |
if ext not in allowed_extensions:
|
40 |
raise HTTPException(status_code=400, detail=f"Unsupported file extension: {ext}")
|
41 |
|
|
|
42 |
with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as input_tmp_file:
|
43 |
input_filename = input_tmp_file.name
|
44 |
with open(input_filename, "wb") as f:
|
45 |
shutil.copyfileobj(file.file, f)
|
46 |
logging.debug(f"Uploaded file: {input_filename}")
|
47 |
|
|
|
48 |
unique_id = uuid.uuid4().hex
|
49 |
output_filename = os.path.join(tempfile.gettempdir(), f"{base_filename}_{unique_id}.txt")
|
50 |
|
51 |
-
#
|
52 |
-
if ext == '.csv':
|
53 |
-
with open(input_filename, newline='') as csvfile, open(output_filename, "w") as txtfile:
|
54 |
-
reader = csv.reader(csvfile)
|
55 |
-
for row in reader:
|
56 |
-
txtfile.write(' '.join(row) + '\n')
|
57 |
-
logging.debug(f"CSV conversion successful: {output_filename}")
|
58 |
-
|
59 |
-
# Conversion PDF
|
60 |
-
elif ext == '.pdf':
|
61 |
text = extract_text(input_filename)
|
62 |
with open(output_filename, "w") as f:
|
63 |
f.write(text)
|
64 |
logging.debug(f"PDF conversion successful: {output_filename}")
|
65 |
-
|
66 |
-
#
|
67 |
else:
|
68 |
output = pypandoc.convert_file(input_filename, 'plain', outputfile=output_filename)
|
69 |
logging.debug(f"Conversion successful: {output_filename}")
|
70 |
|
|
|
71 |
if not os.path.exists(output_filename):
|
72 |
logging.error(f"The file {output_filename} was not generated.")
|
73 |
raise HTTPException(status_code=500, detail="Error during conversion.")
|
74 |
|
|
|
75 |
background_tasks.add_task(delete_temp_files, [input_filename, output_filename])
|
76 |
|
|
|
77 |
return FileResponse(output_filename, filename=f"{base_filename}.txt")
|
78 |
|
79 |
except HTTPException as http_exc:
|
|
|
7 |
import logging
|
8 |
import tempfile
|
9 |
from pdfminer.high_level import extract_text
|
|
|
10 |
|
11 |
# Initialize the logger
|
12 |
logging.basicConfig(level=logging.DEBUG)
|
|
|
27 |
background_tasks: BackgroundTasks = BackgroundTasks()
|
28 |
):
|
29 |
try:
|
30 |
+
# Original file name and extension
|
31 |
original_filename = file.filename
|
32 |
base_filename, ext = os.path.splitext(original_filename)
|
33 |
ext = ext.lower()
|
34 |
|
35 |
+
# Allowed extensions for conversion
|
36 |
allowed_extensions = [
|
37 |
+
'.odt', '.pdf', '.docx', '.html', '.htm', '.md', '.txt', '.rtf', '.epub',
|
38 |
+
'.tex', '.xml', '.org', '.commonmark', '.cm', '.wiki', '.opml'
|
39 |
]
|
40 |
+
|
41 |
if ext not in allowed_extensions:
|
42 |
raise HTTPException(status_code=400, detail=f"Unsupported file extension: {ext}")
|
43 |
|
44 |
+
# Create a temporary input file with the correct extension
|
45 |
with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as input_tmp_file:
|
46 |
input_filename = input_tmp_file.name
|
47 |
with open(input_filename, "wb") as f:
|
48 |
shutil.copyfileobj(file.file, f)
|
49 |
logging.debug(f"Uploaded file: {input_filename}")
|
50 |
|
51 |
+
# Define the output file name, keeping the same base name but with .txt extension
|
52 |
unique_id = uuid.uuid4().hex
|
53 |
output_filename = os.path.join(tempfile.gettempdir(), f"{base_filename}_{unique_id}.txt")
|
54 |
|
55 |
+
# PDF to text conversion
|
56 |
+
if ext == '.pdf':
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
text = extract_text(input_filename)
|
58 |
with open(output_filename, "w") as f:
|
59 |
f.write(text)
|
60 |
logging.debug(f"PDF conversion successful: {output_filename}")
|
61 |
+
|
62 |
+
# Other file formats to text conversion using Pandoc
|
63 |
else:
|
64 |
output = pypandoc.convert_file(input_filename, 'plain', outputfile=output_filename)
|
65 |
logging.debug(f"Conversion successful: {output_filename}")
|
66 |
|
67 |
+
# Check if the .txt file exists
|
68 |
if not os.path.exists(output_filename):
|
69 |
logging.error(f"The file {output_filename} was not generated.")
|
70 |
raise HTTPException(status_code=500, detail="Error during conversion.")
|
71 |
|
72 |
+
# Add temporary files to background task for deletion after sending the response
|
73 |
background_tasks.add_task(delete_temp_files, [input_filename, output_filename])
|
74 |
|
75 |
+
# Return the converted file to the client, with the same base name and .txt extension
|
76 |
return FileResponse(output_filename, filename=f"{base_filename}.txt")
|
77 |
|
78 |
except HTTPException as http_exc:
|