convert_test

Sleeping

App Files Community

Bentham committed on Dec 20, 2024

Commit

4779ec7

verified ·

1 Parent(s): 999909d

Update main.py

Browse files

Files changed (1) hide show

main.py +5 -11

main.py CHANGED Viewed

@@ -392,8 +392,9 @@ def convert_doc_to_text(input_filename: str) -> str:
         logging.warning(f"Erreur lors de la conversion DOC vers HTML avec textract : {str(e)}. Tentative d'extraction du texte uniquement.")
         # Fallback to extracting text if HTML conversion fails
         text = textract.process(input_filename).decode('utf-8', errors='replace')
-        # Wrap the text in basic HTML structure
-        html_content = "<html><head></head><body>" + text.replace('\n', '<br>') + "</body></html>"
         return html_content
 def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
@@ -796,15 +797,8 @@ async def convert_file_to_txt(
             text = convert_ppt_to_text(input_filename)
             html_content = text_to_html(text)
         elif ext == '.doc':
-            try:
-                # Attempt to convert directly to HTML
-                html_content = textract.process(input_filename, output_format='html').decode('utf-8', errors='replace')
-                logging.debug(f"DEBUG CONVERT_TO_TXT (.doc): HTML après conversion textract (direct) : {html_content[:500]}...")
-            except Exception as e:
-                logging.warning(f"Erreur lors de la conversion DOC vers HTML avec textract (direct) : {str(e)}. Tentative d'extraction du texte puis conversion.")
-                text = convert_doc_to_text(input_filename)
-                html_content = text_to_html(text)
-                logging.debug(f"DEBUG CONVERT_TO_TXT (.doc): HTML après conversion text vers HTML : {html_content[:500]}...")
         elif ext in ['.html', '.htm']:
             html_content = convert_with_pandoc(input_filename, 'html')
         else:

         logging.warning(f"Erreur lors de la conversion DOC vers HTML avec textract : {str(e)}. Tentative d'extraction du texte uniquement.")
         # Fallback to extracting text if HTML conversion fails
         text = textract.process(input_filename).decode('utf-8', errors='replace')
+        # Wrap the text in basic HTML structure, ensuring paragraphs are correctly formed
+        html_lines = ['<p>' + line.strip() + '</p>' for line in text.split('\n') if line.strip()]
+        html_content = "<html><head></head><body>" + "\n".join(html_lines) + "</body></html>"
         return html_content
 def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
             text = convert_ppt_to_text(input_filename)
             html_content = text_to_html(text)
         elif ext == '.doc':
+            html_content = convert_doc_to_text(input_filename)
+            logging.debug(f"DEBUG CONVERT_TO_TXT (.doc): HTML après conversion convert_doc_to_text : {html_content[:500]}...")
         elif ext in ['.html', '.htm']:
             html_content = convert_with_pandoc(input_filename, 'html')
         else: