Bentham committed on
Commit
4779ec7
·
verified ·
1 Parent(s): 999909d

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +5 -11
main.py CHANGED
@@ -392,8 +392,9 @@ def convert_doc_to_text(input_filename: str) -> str:
392
  logging.warning(f"Erreur lors de la conversion DOC vers HTML avec textract : {str(e)}. Tentative d'extraction du texte uniquement.")
393
  # Fallback to extracting text if HTML conversion fails
394
  text = textract.process(input_filename).decode('utf-8', errors='replace')
395
- # Wrap the text in basic HTML structure
396
- html_content = "<html><head></head><body>" + text.replace('\n', '<br>') + "</body></html>"
 
397
  return html_content
398
 
399
  def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
@@ -796,15 +797,8 @@ async def convert_file_to_txt(
796
  text = convert_ppt_to_text(input_filename)
797
  html_content = text_to_html(text)
798
  elif ext == '.doc':
799
- try:
800
- # Attempt to convert directly to HTML
801
- html_content = textract.process(input_filename, output_format='html').decode('utf-8', errors='replace')
802
- logging.debug(f"DEBUG CONVERT_TO_TXT (.doc): HTML après conversion textract (direct) : {html_content[:500]}...")
803
- except Exception as e:
804
- logging.warning(f"Erreur lors de la conversion DOC vers HTML avec textract (direct) : {str(e)}. Tentative d'extraction du texte puis conversion.")
805
- text = convert_doc_to_text(input_filename)
806
- html_content = text_to_html(text)
807
- logging.debug(f"DEBUG CONVERT_TO_TXT (.doc): HTML après conversion text vers HTML : {html_content[:500]}...")
808
  elif ext in ['.html', '.htm']:
809
  html_content = convert_with_pandoc(input_filename, 'html')
810
  else:
 
392
  logging.warning(f"Erreur lors de la conversion DOC vers HTML avec textract : {str(e)}. Tentative d'extraction du texte uniquement.")
393
  # Fallback to extracting text if HTML conversion fails
394
  text = textract.process(input_filename).decode('utf-8', errors='replace')
395
+ # Wrap the text in basic HTML structure, ensuring paragraphs are correctly formed
396
+ html_lines = ['<p>' + line.strip() + '</p>' for line in text.split('\n') if line.strip()]
397
+ html_content = "<html><head></head><body>" + "\n".join(html_lines) + "</body></html>"
398
  return html_content
399
 
400
  def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
 
797
  text = convert_ppt_to_text(input_filename)
798
  html_content = text_to_html(text)
799
  elif ext == '.doc':
800
+ html_content = convert_doc_to_text(input_filename)
801
+ logging.debug(f"DEBUG CONVERT_TO_TXT (.doc): HTML après conversion convert_doc_to_text : {html_content[:500]}...")
 
 
 
 
 
 
 
802
  elif ext in ['.html', '.htm']:
803
  html_content = convert_with_pandoc(input_filename, 'html')
804
  else: