convert_test

Sleeping

App Files Community

Bentham committed on Dec 20, 2024

Commit

4777466

verified ·

1 Parent(s): 4779ec7

Code 14. Tant pis pour les images dans les .doc !

Browse files

Files changed (1) hide show

main.py +7 -16

main.py CHANGED Viewed

@@ -384,18 +384,8 @@ def convert_pptx_to_html(input_filename: str) -> str:
 def convert_doc_to_text(input_filename: str) -> str:
     if 'textract' not in globals():
         raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")
-    try:
-        # Attempt to extract the document as HTML directly using textract
-        html_content = textract.process(input_filename, output_format='html').decode('utf-8', errors='replace')
-        return html_content
-    except Exception as e:
-        logging.warning(f"Erreur lors de la conversion DOC vers HTML avec textract : {str(e)}. Tentative d'extraction du texte uniquement.")
-        # Fallback to extracting text if HTML conversion fails
-        text = textract.process(input_filename).decode('utf-8', errors='replace')
-        # Wrap the text in basic HTML structure, ensuring paragraphs are correctly formed
-        html_lines = ['<p>' + line.strip() + '</p>' for line in text.split('\n') if line.strip()]
-        html_content = "<html><head></head><body>" + "\n".join(html_lines) + "</body></html>"
-        return html_content
 def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
     try:
@@ -636,9 +626,10 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
             html_content = text_to_html(text)
             html_content = insert_page_comments_every_15_paragraphs(html_content)
         elif ext == '.doc':
-            html_content = convert_doc_to_text(input_file_path)
             html_content = insert_page_comments_every_15_paragraphs(html_content)
-            logging.debug(f"DEBUG ACCESSIBILITY (.doc): HTML après conversion textract : {html_content[:500]}...")
         elif ext in ['.html', '.htm']:
             with open(input_file_path, 'r', encoding='utf-8') as f:
                 raw_html = f.read()
@@ -797,8 +788,8 @@ async def convert_file_to_txt(
             text = convert_ppt_to_text(input_filename)
             html_content = text_to_html(text)
         elif ext == '.doc':
-            html_content = convert_doc_to_text(input_filename)
-            logging.debug(f"DEBUG CONVERT_TO_TXT (.doc): HTML après conversion convert_doc_to_text : {html_content[:500]}...")
         elif ext in ['.html', '.htm']:
             html_content = convert_with_pandoc(input_filename, 'html')
         else:

 def convert_doc_to_text(input_filename: str) -> str:
     if 'textract' not in globals():
         raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")
+    text = textract.process(input_filename).decode('utf-8', errors='replace')
+    return text
 def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
     try:
             html_content = text_to_html(text)
             html_content = insert_page_comments_every_15_paragraphs(html_content)
         elif ext == '.doc':
+            text = convert_doc_to_text(input_file_path)
+            html_content = text_to_html(text)
             html_content = insert_page_comments_every_15_paragraphs(html_content)
+            logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
         elif ext in ['.html', '.htm']:
             with open(input_file_path, 'r', encoding='utf-8') as f:
                 raw_html = f.read()
             text = convert_ppt_to_text(input_filename)
             html_content = text_to_html(text)
         elif ext == '.doc':
+            text = convert_doc_to_text(input_filename)
+            html_content = text_to_html(text)
         elif ext in ['.html', '.htm']:
             html_content = convert_with_pandoc(input_filename, 'html')
         else: