Spaces:

Bentham
/

convertToTXT

Sleeping

App Files Community

Bentham committed on Oct 26, 2024

Commit

d26c63c

verified ·

1 Parent(s): df2ec6f

Update main.py

Browse files

Files changed (1) hide show

main.py +14 -15

main.py CHANGED Viewed

@@ -7,7 +7,6 @@ import shutil
 import logging
 import tempfile
 from pdfminer.high_level import extract_text
-import csv
 # Initialize the logger
 logging.basicConfig(level=logging.DEBUG)
@@ -28,52 +27,52 @@ async def convert_file_to_txt(
     background_tasks: BackgroundTasks = BackgroundTasks()
 ):
     try:
         original_filename = file.filename
         base_filename, ext = os.path.splitext(original_filename)
         ext = ext.lower()
         allowed_extensions = [
-            '.odt', '.pdf', '.doc', '.docx', '.html', '.htm', '.md', '.txt', '.rtf',
-            '.epub', '.csv', '.ppt', '.pptx', '.xls', '.xlsx'
         ]
         if ext not in allowed_extensions:
             raise HTTPException(status_code=400, detail=f"Unsupported file extension: {ext}")
         with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as input_tmp_file:
             input_filename = input_tmp_file.name
             with open(input_filename, "wb") as f:
                 shutil.copyfileobj(file.file, f)
             logging.debug(f"Uploaded file: {input_filename}")
         unique_id = uuid.uuid4().hex
         output_filename = os.path.join(tempfile.gettempdir(), f"{base_filename}_{unique_id}.txt")
-        # Conversion CSV
-        if ext == '.csv':
-            with open(input_filename, newline='') as csvfile, open(output_filename, "w") as txtfile:
-                reader = csv.reader(csvfile)
-                for row in reader:
-                    txtfile.write(' '.join(row) + '\n')
-            logging.debug(f"CSV conversion successful: {output_filename}")
-        # Conversion PDF
-        elif ext == '.pdf':
             text = extract_text(input_filename)
             with open(output_filename, "w") as f:
                 f.write(text)
             logging.debug(f"PDF conversion successful: {output_filename}")
-        # Autres formats pris en charge par Pandoc
         else:
             output = pypandoc.convert_file(input_filename, 'plain', outputfile=output_filename)
             logging.debug(f"Conversion successful: {output_filename}")
         if not os.path.exists(output_filename):
             logging.error(f"The file {output_filename} was not generated.")
             raise HTTPException(status_code=500, detail="Error during conversion.")
         background_tasks.add_task(delete_temp_files, [input_filename, output_filename])
         return FileResponse(output_filename, filename=f"{base_filename}.txt")
     except HTTPException as http_exc:

 import logging
 import tempfile
 from pdfminer.high_level import extract_text
 # Initialize the logger
 logging.basicConfig(level=logging.DEBUG)
     background_tasks: BackgroundTasks = BackgroundTasks()
 ):
     try:
+        # Original file name and extension
         original_filename = file.filename
         base_filename, ext = os.path.splitext(original_filename)
         ext = ext.lower()
+        # Allowed extensions for conversion
         allowed_extensions = [
+            '.odt', '.pdf', '.docx', '.html', '.htm', '.md', '.txt', '.rtf', '.epub',
+            '.tex', '.xml', '.org', '.commonmark', '.cm', '.wiki', '.opml'
         ]
         if ext not in allowed_extensions:
             raise HTTPException(status_code=400, detail=f"Unsupported file extension: {ext}")
+        # Create a temporary input file with the correct extension
         with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as input_tmp_file:
             input_filename = input_tmp_file.name
             with open(input_filename, "wb") as f:
                 shutil.copyfileobj(file.file, f)
             logging.debug(f"Uploaded file: {input_filename}")
+        # Define the output file name, keeping the same base name but with .txt extension
         unique_id = uuid.uuid4().hex
         output_filename = os.path.join(tempfile.gettempdir(), f"{base_filename}_{unique_id}.txt")
+        # PDF to text conversion
+        if ext == '.pdf':
             text = extract_text(input_filename)
             with open(output_filename, "w") as f:
                 f.write(text)
             logging.debug(f"PDF conversion successful: {output_filename}")
+        # Other file formats to text conversion using Pandoc
         else:
             output = pypandoc.convert_file(input_filename, 'plain', outputfile=output_filename)
             logging.debug(f"Conversion successful: {output_filename}")
+        # Check if the .txt file exists
         if not os.path.exists(output_filename):
             logging.error(f"The file {output_filename} was not generated.")
             raise HTTPException(status_code=500, detail="Error during conversion.")
+        # Add temporary files to background task for deletion after sending the response
         background_tasks.add_task(delete_temp_files, [input_filename, output_filename])
+        # Return the converted file to the client, with the same base name and .txt extension
         return FileResponse(output_filename, filename=f"{base_filename}.txt")
     except HTTPException as http_exc: