Bentham committed on
Commit
4777466
·
verified ·
1 Parent(s): 4779ec7

Code 14. Tant pis pour les images dans les .doc !

Browse files
Files changed (1) hide show
  1. main.py +7 -16
main.py CHANGED
@@ -384,18 +384,8 @@ def convert_pptx_to_html(input_filename: str) -> str:
384
  def convert_doc_to_text(input_filename: str) -> str:
385
  if 'textract' not in globals():
386
  raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")
387
- try:
388
- # Attempt to extract the document as HTML directly using textract
389
- html_content = textract.process(input_filename, output_format='html').decode('utf-8', errors='replace')
390
- return html_content
391
- except Exception as e:
392
- logging.warning(f"Erreur lors de la conversion DOC vers HTML avec textract : {str(e)}. Tentative d'extraction du texte uniquement.")
393
- # Fallback to extracting text if HTML conversion fails
394
- text = textract.process(input_filename).decode('utf-8', errors='replace')
395
- # Wrap the text in basic HTML structure, ensuring paragraphs are correctly formed
396
- html_lines = ['<p>' + line.strip() + '</p>' for line in text.split('\n') if line.strip()]
397
- html_content = "<html><head></head><body>" + "\n".join(html_lines) + "</body></html>"
398
- return html_content
399
 
400
  def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
401
  try:
@@ -636,9 +626,10 @@ async def process_file_accessibility(job_id: str, input_file_path: str, ext: str
636
  html_content = text_to_html(text)
637
  html_content = insert_page_comments_every_15_paragraphs(html_content)
638
  elif ext == '.doc':
639
- html_content = convert_doc_to_text(input_file_path)
 
640
  html_content = insert_page_comments_every_15_paragraphs(html_content)
641
- logging.debug(f"DEBUG ACCESSIBILITY (.doc): HTML après conversion textract : {html_content[:500]}...")
642
  elif ext in ['.html', '.htm']:
643
  with open(input_file_path, 'r', encoding='utf-8') as f:
644
  raw_html = f.read()
@@ -797,8 +788,8 @@ async def convert_file_to_txt(
797
  text = convert_ppt_to_text(input_filename)
798
  html_content = text_to_html(text)
799
  elif ext == '.doc':
800
- html_content = convert_doc_to_text(input_filename)
801
- logging.debug(f"DEBUG CONVERT_TO_TXT (.doc): HTML après conversion convert_doc_to_text : {html_content[:500]}...")
802
  elif ext in ['.html', '.htm']:
803
  html_content = convert_with_pandoc(input_filename, 'html')
804
  else:
 
384
  def convert_doc_to_text(input_filename: str) -> str:
385
  if 'textract' not in globals():
386
  raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")
387
+ text = textract.process(input_filename).decode('utf-8', errors='replace')
388
+ return text
 
 
 
 
 
 
 
 
 
 
389
 
390
  def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
391
  try:
 
626
  html_content = text_to_html(text)
627
  html_content = insert_page_comments_every_15_paragraphs(html_content)
628
  elif ext == '.doc':
629
+ text = convert_doc_to_text(input_file_path)
630
+ html_content = text_to_html(text)
631
  html_content = insert_page_comments_every_15_paragraphs(html_content)
632
+ logging.debug(f"DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : {html_content[:500]}...")
633
  elif ext in ['.html', '.htm']:
634
  with open(input_file_path, 'r', encoding='utf-8') as f:
635
  raw_html = f.read()
 
788
  text = convert_ppt_to_text(input_filename)
789
  html_content = text_to_html(text)
790
  elif ext == '.doc':
791
+ text = convert_doc_to_text(input_filename)
792
+ html_content = text_to_html(text)
793
  elif ext in ['.html', '.htm']:
794
  html_content = convert_with_pandoc(input_filename, 'html')
795
  else: