Bentham committed on
Commit
999909d
·
verified ·
1 Parent(s): 5aa3923

prise en charge images doc ?

Browse files
Files changed (1) hide show
  1. main.py +22 -7
main.py CHANGED
@@ -384,8 +384,17 @@ def convert_pptx_to_html(input_filename: str) -> str:
384
  def convert_doc_to_text(input_filename: str) -> str:
385
  if 'textract' not in globals():
386
  raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")
387
- text = textract.process(input_filename).decode('utf-8', errors='replace')
388
- return text
 
 
 
 
 
 
 
 
 
389
 
390
  def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
391
  try:
@@ -626,10 +635,9 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
626
  html_content = text_to_html(text)
627
  html_content = insert_page_comments_every_15_paragraphs(html_content)
628
  elif ext == '.doc':
629
- text = convert_doc_to_text(input_file_path)
630
- html_content = text_to_html(text)
631
  html_content = insert_page_comments_every_15_paragraphs(html_content)
632
- logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
633
  elif ext in ['.html', '.htm']:
634
  with open(input_file_path, 'r', encoding='utf-8') as f:
635
  raw_html = f.read()
@@ -788,8 +796,15 @@ async def convert_file_to_txt(
788
  text = convert_ppt_to_text(input_filename)
789
  html_content = text_to_html(text)
790
  elif ext == '.doc':
791
- text = convert_doc_to_text(input_filename)
792
- html_content = text_to_html(text)
 
 
 
 
 
 
 
793
  elif ext in ['.html', '.htm']:
794
  html_content = convert_with_pandoc(input_filename, 'html')
795
  else:
 
def convert_doc_to_text(input_filename: str) -> str:
    """Convert a legacy ``.doc`` file to an HTML string using textract.

    Despite its historical name, this function returns HTML, not plain text.
    It first asks textract for HTML output; if that call raises, it falls
    back to plain-text extraction and wraps the (HTML-escaped) text in a
    minimal HTML document, turning newlines into ``<br>`` tags.

    Args:
        input_filename: Path to the ``.doc`` file to convert.

    Returns:
        The document content as an HTML string.

    Raises:
        HTTPException: With status 500 when ``textract`` is not available
            in the module globals.
        Exception: Whatever textract raises if the plain-text fallback
            fails as well.
    """
    import html  # local import: only needed by the plain-text fallback

    if 'textract' not in globals():
        raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")
    try:
        # Attempt to extract the document as HTML directly using textract.
        # NOTE(review): textract's documented ``process`` options do not
        # appear to include ``output_format``; the .doc backend may ignore
        # it and return plain text here -- confirm against the installed
        # textract version.
        html_content = textract.process(input_filename, output_format='html').decode('utf-8', errors='replace')
        return html_content
    except Exception as e:
        logging.warning(f"Erreur lors de la conversion DOC vers HTML avec textract : {str(e)}. Tentative d'extraction du texte uniquement.")
        # Fallback to extracting text if HTML conversion fails.
        text = textract.process(input_filename).decode('utf-8', errors='replace')
        # Escape markup-significant characters so document text such as
        # "a < b" or "R&D" cannot break or inject into the generated HTML.
        escaped_text = html.escape(text, quote=False)
        # Wrap the text in a basic HTML structure, preserving line breaks.
        html_content = "<html><head></head><body>" + escaped_text.replace('\n', '<br>') + "</body></html>"
        return html_content
398
 
399
  def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
400
  try:
 
635
  html_content = text_to_html(text)
636
  html_content = insert_page_comments_every_15_paragraphs(html_content)
637
  elif ext == '.doc':
638
+ html_content = convert_doc_to_text(input_file_path)
 
639
  html_content = insert_page_comments_every_15_paragraphs(html_content)
640
+ logging.debug(f"DEBUG ACCESSIBILITY (.doc): HTML après conversion textract : {html_content[:500]}...")
641
  elif ext in ['.html', '.htm']:
642
  with open(input_file_path, 'r', encoding='utf-8') as f:
643
  raw_html = f.read()
 
796
  text = convert_ppt_to_text(input_filename)
797
  html_content = text_to_html(text)
798
  elif ext == '.doc':
799
+ try:
800
+ # Attempt to convert directly to HTML
801
+ html_content = textract.process(input_filename, output_format='html').decode('utf-8', errors='replace')
802
+ logging.debug(f"DEBUG CONVERT_TO_TXT (.doc): HTML après conversion textract (direct) : {html_content[:500]}...")
803
+ except Exception as e:
804
+ logging.warning(f"Erreur lors de la conversion DOC vers HTML avec textract (direct) : {str(e)}. Tentative d'extraction du texte puis conversion.")
805
+ text = convert_doc_to_text(input_filename)
806
+ html_content = text_to_html(text)
807
+ logging.debug(f"DEBUG CONVERT_TO_TXT (.doc): HTML après conversion text vers HTML : {html_content[:500]}...")
808
  elif ext in ['.html', '.htm']:
809
  html_content = convert_with_pandoc(input_filename, 'html')
810
  else: