Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
@@ -392,8 +392,9 @@ def convert_doc_to_text(input_filename: str) -> str:
|
|
392 |
logging.warning(f"Erreur lors de la conversion DOC vers HTML avec textract : {str(e)}. Tentative d'extraction du texte uniquement.")
|
393 |
# Fallback to extracting text if HTML conversion fails
|
394 |
text = textract.process(input_filename).decode('utf-8', errors='replace')
|
395 |
-
# Wrap the text in basic HTML structure
|
396 |
-
|
|
|
397 |
return html_content
|
398 |
|
399 |
def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
|
@@ -796,15 +797,8 @@ async def convert_file_to_txt(
|
|
796 |
text = convert_ppt_to_text(input_filename)
|
797 |
html_content = text_to_html(text)
|
798 |
elif ext == '.doc':
|
799 |
-
|
800 |
-
|
801 |
-
html_content = textract.process(input_filename, output_format='html').decode('utf-8', errors='replace')
|
802 |
-
logging.debug(f"DEBUG CONVERT_TO_TXT (.doc): HTML après conversion textract (direct) : {html_content[:500]}...")
|
803 |
-
except Exception as e:
|
804 |
-
logging.warning(f"Erreur lors de la conversion DOC vers HTML avec textract (direct) : {str(e)}. Tentative d'extraction du texte puis conversion.")
|
805 |
-
text = convert_doc_to_text(input_filename)
|
806 |
-
html_content = text_to_html(text)
|
807 |
-
logging.debug(f"DEBUG CONVERT_TO_TXT (.doc): HTML après conversion text vers HTML : {html_content[:500]}...")
|
808 |
elif ext in ['.html', '.htm']:
|
809 |
html_content = convert_with_pandoc(input_filename, 'html')
|
810 |
else:
|
|
|
392 |
logging.warning(f"Erreur lors de la conversion DOC vers HTML avec textract : {str(e)}. Tentative d'extraction du texte uniquement.")
|
393 |
# Fallback to extracting text if HTML conversion fails
|
394 |
text = textract.process(input_filename).decode('utf-8', errors='replace')
|
395 |
+
# Wrap the text in basic HTML structure, ensuring paragraphs are correctly formed
|
396 |
+
html_lines = ['<p>' + line.strip() + '</p>' for line in text.split('\n') if line.strip()]
|
397 |
+
html_content = "<html><head></head><body>" + "\n".join(html_lines) + "</body></html>"
|
398 |
return html_content
|
399 |
|
400 |
def clean_html_file(input_filepath: str, cleaned_output_filepath: str) -> bool:
|
|
|
797 |
text = convert_ppt_to_text(input_filename)
|
798 |
html_content = text_to_html(text)
|
799 |
elif ext == '.doc':
|
800 |
+
html_content = convert_doc_to_text(input_filename)
|
801 |
+
logging.debug(f"DEBUG CONVERT_TO_TXT (.doc): HTML après conversion convert_doc_to_text : {html_content[:500]}...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
802 |
elif ext in ['.html', '.htm']:
|
803 |
html_content = convert_with_pandoc(input_filename, 'html')
|
804 |
else:
|