convert_test

Sleeping

App Files Community

Bentham committed on Dec 20, 2024

Commit

999909d

verified ·

1 Parent(s): 5aa3923

prise en charge images doc ?

Browse files

Files changed (1) hide show

main.py +22 -7

main.py CHANGED Viewed

@@ -384,8 +384,17 @@ def convert_pptx_to_html(input_filename: str) -> str:
 def convert_doc_to_text(input_filename: str) -> str:
     # Fail with an HTTP 500 when the name `textract` is absent from the module globals.
     if 'textract' not in globals():
         raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")
-    # Pre-change behavior: run textract on the file and return its output decoded
-    # as UTF-8, with undecodable bytes replaced rather than raising.
-    text = textract.process(input_filename).decode('utf-8', errors='replace')
-    return text
 def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
     try:
@@ -626,10 +635,9 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
             html_content = text_to_html(text)
             html_content = insert_page_comments_every_15_paragraphs(html_content)
         elif ext == '.doc':
-            text = convert_doc_to_text(input_file_path)
-            html_content = text_to_html(text)
             html_content = insert_page_comments_every_15_paragraphs(html_content)
-            logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
         elif ext in ['.html', '.htm']:
             with open(input_file_path, 'r', encoding='utf-8') as f:
                 raw_html = f.read()
@@ -788,8 +796,15 @@ async def convert_file_to_txt(
             text = convert_ppt_to_text(input_filename)
             html_content = text_to_html(text)
         elif ext == '.doc':
-            text = convert_doc_to_text(input_filename)
-            html_content = text_to_html(text)
         elif ext in ['.html', '.htm']:
             html_content = convert_with_pandoc(input_filename, 'html')
         else:

 def convert_doc_to_text(input_filename: str) -> str:
     if 'textract' not in globals():
         raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")
+    try:
+        # Attempt to extract the document as HTML directly using textract
+        html_content = textract.process(input_filename, output_format='html').decode('utf-8', errors='replace')
+        return html_content
+    except Exception as e:
+        logging.warning(f"Erreur lors de la conversion DOC vers HTML avec textract : {str(e)}. Tentative d'extraction du texte uniquement.")
+        # Fallback to extracting text if HTML conversion fails
+        text = textract.process(input_filename).decode('utf-8', errors='replace')
+        # Wrap the text in basic HTML structure
+        html_content = "<html><head></head><body>" + text.replace('\n', '<br>') + "</body></html>"
+        return html_content
 def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
     try:
             html_content = text_to_html(text)
             html_content = insert_page_comments_every_15_paragraphs(html_content)
         elif ext == '.doc':
+            html_content = convert_doc_to_text(input_file_path)
             html_content = insert_page_comments_every_15_paragraphs(html_content)
+            logging.debug(f"DEBUG ACCESSIBILITY (.doc): HTML après conversion textract : {html_content[:500]}...")
         elif ext in ['.html', '.htm']:
             with open(input_file_path, 'r', encoding='utf-8') as f:
                 raw_html = f.read()
             text = convert_ppt_to_text(input_filename)
             html_content = text_to_html(text)
         elif ext == '.doc':
+            try:
+                # Attempt to convert directly to HTML
+                html_content = textract.process(input_filename, output_format='html').decode('utf-8', errors='replace')
+                logging.debug(f"DEBUG CONVERT_TO_TXT (.doc): HTML après conversion textract (direct) : {html_content[:500]}...")
+            except Exception as e:
+                logging.warning(f"Erreur lors de la conversion DOC vers HTML avec textract (direct) : {str(e)}. Tentative d'extraction du texte puis conversion.")
+                text = convert_doc_to_text(input_filename)
+                html_content = text_to_html(text)
+                logging.debug(f"DEBUG CONVERT_TO_TXT (.doc): HTML après conversion text vers HTML : {html_content[:500]}...")
         elif ext in ['.html', '.htm']:
             html_content = convert_with_pandoc(input_filename, 'html')
         else: