Spaces:
Sleeping
Sleeping
Code 14. Tant pis pour les images dans les .doc !
Browse files
main.py
CHANGED
@@ -384,18 +384,8 @@ def convert_pptx_to_html(input_filename: str) -> str:
|
|
384 |
def convert_doc_to_text(input_filename: str) -> str:
|
385 |
if 'textract' not in globals():
|
386 |
raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")
|
387 |
-
|
388 |
-
|
389 |
-
html_content = textract.process(input_filename, output_format='html').decode('utf-8', errors='replace')
|
390 |
-
return html_content
|
391 |
-
except Exception as e:
|
392 |
-
logging.warning(f"Erreur lors de la conversion DOC vers HTML avec textract : {str(e)}. Tentative d'extraction du texte uniquement.")
|
393 |
-
# Fallback to extracting text if HTML conversion fails
|
394 |
-
text = textract.process(input_filename).decode('utf-8', errors='replace')
|
395 |
-
# Wrap the text in basic HTML structure, ensuring paragraphs are correctly formed
|
396 |
-
html_lines = ['<p>' + line.strip() + '</p>' for line in text.split('\n') if line.strip()]
|
397 |
-
html_content = "<html><head></head><body>" + "\n".join(html_lines) + "</body></html>"
|
398 |
-
return html_content
|
399 |
|
400 |
def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
|
401 |
try:
|
@@ -636,9 +626,10 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
|
|
636 |
html_content = text_to_html(text)
|
637 |
html_content = insert_page_comments_every_15_paragraphs(html_content)
|
638 |
elif ext == '.doc':
|
639 |
-
|
|
|
640 |
html_content = insert_page_comments_every_15_paragraphs(html_content)
|
641 |
-
logging.debug(f"DEBUG ACCESSIBILITY (.
|
642 |
elif ext in ['.html', '.htm']:
|
643 |
with open(input_file_path, 'r', encoding='utf-8') as f:
|
644 |
raw_html = f.read()
|
@@ -797,8 +788,8 @@ async def convert_file_to_txt(
|
|
797 |
text = convert_ppt_to_text(input_filename)
|
798 |
html_content = text_to_html(text)
|
799 |
elif ext == '.doc':
|
800 |
-
|
801 |
-
|
802 |
elif ext in ['.html', '.htm']:
|
803 |
html_content = convert_with_pandoc(input_filename, 'html')
|
804 |
else:
|
|
|
384 |
def convert_doc_to_text(input_filename: str) -> str:
    """Return the content of a .doc file as a string, extracted with textract.

    Args:
        input_filename: Path of the document handed to ``textract.process``.

    Returns:
        The bytes produced by ``textract.process`` decoded as UTF-8; bytes
        that cannot be decoded are replaced rather than raising.

    Raises:
        HTTPException: Status 500 when no module-level name ``textract``
            exists (presumably because its import failed — confirm against
            the import section of the file).
    """
    textract_available = 'textract' in globals()
    if not textract_available:
        raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")

    # No output_format is requested here, so textract's default is used.
    raw_bytes = textract.process(input_filename)
    return raw_bytes.decode('utf-8', errors='replace')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
389 |
|
390 |
def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
|
391 |
try:
|
|
|
626 |
html_content = text_to_html(text)
|
627 |
html_content = insert_page_comments_every_15_paragraphs(html_content)
|
628 |
elif ext == '.doc':
|
629 |
+
text = convert_doc_to_text(input_file_path)
|
630 |
+
html_content = text_to_html(text)
|
631 |
html_content = insert_page_comments_every_15_paragraphs(html_content)
|
632 |
+
logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
|
633 |
elif ext in ['.html', '.htm']:
|
634 |
with open(input_file_path, 'r', encoding='utf-8') as f:
|
635 |
raw_html = f.read()
|
|
|
788 |
text = convert_ppt_to_text(input_filename)
|
789 |
html_content = text_to_html(text)
|
790 |
elif ext == '.doc':
|
791 |
+
text = convert_doc_to_text(input_filename)
|
792 |
+
html_content = text_to_html(text)
|
793 |
elif ext in ['.html', '.htm']:
|
794 |
html_content = convert_with_pandoc(input_filename, 'html')
|
795 |
else:
|