Spaces:
Sleeping
Sleeping
prise en charge images doc ?
Browse files
main.py
CHANGED
@@ -384,8 +384,17 @@ def convert_pptx_to_html(input_filename: str) -> str:
|
|
384 |
def convert_doc_to_text(input_filename: str) -> str:
|
385 |
if 'textract' not in globals():
|
386 |
raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")
|
387 |
-
|
388 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
389 |
|
390 |
def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
|
391 |
try:
|
@@ -626,10 +635,9 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
|
|
626 |
html_content = text_to_html(text)
|
627 |
html_content = insert_page_comments_every_15_paragraphs(html_content)
|
628 |
elif ext == '.doc':
|
629 |
-
|
630 |
-
html_content = text_to_html(text)
|
631 |
html_content = insert_page_comments_every_15_paragraphs(html_content)
|
632 |
-
logging.debug(f"DEBUG ACCESSIBILITY (.
|
633 |
elif ext in ['.html', '.htm']:
|
634 |
with open(input_file_path, 'r', encoding='utf-8') as f:
|
635 |
raw_html = f.read()
|
@@ -788,8 +796,15 @@ async def convert_file_to_txt(
|
|
788 |
text = convert_ppt_to_text(input_filename)
|
789 |
html_content = text_to_html(text)
|
790 |
elif ext == '.doc':
|
791 |
-
|
792 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
793 |
elif ext in ['.html', '.htm']:
|
794 |
html_content = convert_with_pandoc(input_filename, 'html')
|
795 |
else:
|
|
|
384 |
def convert_doc_to_text(input_filename: str) -> str:
    """Convert a legacy ``.doc`` file to an HTML string using textract.

    Despite its name, this function returns HTML markup, not plain text:
    it first asks textract for HTML output and, if that raises, falls back
    to plain-text extraction wrapped in a minimal HTML document.

    Args:
        input_filename: Path of the ``.doc`` file to convert.

    Returns:
        The document content as an HTML string.

    Raises:
        HTTPException: With status 500 when the ``textract`` module is not
            available in this module's globals.
        Exception: Whatever ``textract.process`` raises if the plain-text
            fallback extraction fails as well; it is not caught here.
    """
    # Local import so the block does not depend on a file-level import.
    from html import escape

    if 'textract' not in globals():
        raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")

    try:
        # Attempt to extract the document as HTML directly using textract.
        # NOTE(review): confirm that textract's .doc parser actually honours
        # ``output_format='html'``; if the keyword is ignored, this returns
        # plain text that callers will then treat as HTML.
        return textract.process(input_filename, output_format='html').decode('utf-8', errors='replace')
    except Exception as e:
        # Deliberate best-effort: any failure of the HTML path triggers the
        # plain-text fallback below.
        logging.warning(f"Erreur lors de la conversion DOC vers HTML avec textract : {str(e)}. Tentative d'extraction du texte uniquement.")

    # Fallback: extract plain text and wrap it in a basic HTML structure.
    text = textract.process(input_filename).decode('utf-8', errors='replace')
    # Escape markup-significant characters BEFORE inserting <br> tags, so that
    # '<', '>' and '&' in the document cannot corrupt the HTML or inject tags.
    body = escape(text, quote=False).replace('\n', '<br>')
    return "<html><head></head><body>" + body + "</body></html>"
|
398 |
|
399 |
def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
|
400 |
try:
|
|
|
635 |
html_content = text_to_html(text)
|
636 |
html_content = insert_page_comments_every_15_paragraphs(html_content)
|
637 |
elif ext == '.doc':
|
638 |
+
html_content = convert_doc_to_text(input_file_path)
|
|
|
639 |
html_content = insert_page_comments_every_15_paragraphs(html_content)
|
640 |
+
logging.debug(f"DEBUG ACCESSIBILITY (.doc): HTML après conversion textract : {html_content[:500]}...")
|
641 |
elif ext in ['.html', '.htm']:
|
642 |
with open(input_file_path, 'r', encoding='utf-8') as f:
|
643 |
raw_html = f.read()
|
|
|
796 |
text = convert_ppt_to_text(input_filename)
|
797 |
html_content = text_to_html(text)
|
798 |
elif ext == '.doc':
|
799 |
+
try:
|
800 |
+
# Attempt to convert directly to HTML
|
801 |
+
html_content = textract.process(input_filename, output_format='html').decode('utf-8', errors='replace')
|
802 |
+
logging.debug(f"DEBUG CONVERT_TO_TXT (.doc): HTML après conversion textract (direct) : {html_content[:500]}...")
|
803 |
+
except Exception as e:
|
804 |
+
logging.warning(f"Erreur lors de la conversion DOC vers HTML avec textract (direct) : {str(e)}. Tentative d'extraction du texte puis conversion.")
|
805 |
+
text = convert_doc_to_text(input_filename)
|
806 |
+
html_content = text_to_html(text)
|
807 |
+
logging.debug(f"DEBUG CONVERT_TO_TXT (.doc): HTML après conversion text vers HTML : {html_content[:500]}...")
|
808 |
elif ext in ['.html', '.htm']:
|
809 |
html_content = convert_with_pandoc(input_filename, 'html')
|
810 |
else:
|