Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
@@ -355,15 +355,15 @@ def convert_ppt_to_text(input_filename: str) -> str:
|
|
355 |
if image_extension in ['png', 'jpeg', 'jpg', 'gif']:
|
356 |
mime_type = f'image/{image_extension}'
|
357 |
else:
|
358 |
-
mime_type = 'image/png' #
|
359 |
data_uri = f'data:{mime_type};base64,{image_base64}'
|
360 |
-
#
|
361 |
-
alt_text = shape.name #
|
362 |
img_tag = f'<img src="{data_uri}" alt="{alt_text}">'
|
363 |
text_content.append(img_tag)
|
364 |
-
text_content.append(shape.text)
|
365 |
return "\n".join(text_content)
|
366 |
|
|
|
367 |
def convert_doc_to_text(input_filename: str) -> str:
|
368 |
if 'textract' not in globals():
|
369 |
raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")
|
@@ -592,110 +592,97 @@ async def convert_file_to_html(
|
|
592 |
logging.error(f"Erreur lors du démarrage du job : {str(e)}")
|
593 |
return JSONResponse(status_code=500, content={"message": f"Erreur lors du démarrage du job : {str(e)}"})
|
594 |
|
595 |
-
async def process_file_accessibility(job_id: str, input_file_path: str, ext: str, original_filename: str):
|
596 |
-
|
597 |
-
|
598 |
-
|
599 |
-
|
600 |
-
|
601 |
-
|
602 |
-
|
603 |
-
|
604 |
-
|
605 |
-
|
606 |
-
|
607 |
-
|
608 |
-
|
609 |
-
|
610 |
-
|
611 |
-
|
612 |
-
|
613 |
-
|
614 |
-
|
615 |
-
|
616 |
-
|
617 |
-
|
618 |
-
|
619 |
-
|
620 |
-
|
621 |
-
|
622 |
-
html_content =
|
623 |
-
|
624 |
-
|
625 |
-
|
626 |
-
|
627 |
-
|
628 |
-
|
629 |
-
|
630 |
-
|
631 |
-
|
632 |
-
|
633 |
-
logging.debug(f"DEBUG ACCESSIBILITY
|
634 |
-
|
635 |
-
|
636 |
-
|
637 |
-
|
638 |
-
|
639 |
-
|
640 |
-
|
641 |
-
|
642 |
-
|
643 |
-
|
644 |
-
|
645 |
-
|
646 |
-
|
647 |
-
|
648 |
-
|
649 |
-
|
650 |
-
|
651 |
-
|
652 |
-
|
653 |
-
|
654 |
-
|
655 |
-
|
656 |
-
)
|
657 |
-
|
658 |
-
|
659 |
-
|
660 |
-
|
661 |
-
|
662 |
-
|
663 |
-
|
664 |
-
|
665 |
-
|
666 |
-
|
667 |
-
|
668 |
-
|
669 |
-
|
670 |
-
|
671 |
-
|
672 |
-
|
673 |
-
|
674 |
-
|
675 |
-
|
676 |
-
|
677 |
-
|
678 |
-
|
679 |
-
|
680 |
-
|
681 |
-
|
682 |
-
|
683 |
-
|
684 |
-
|
685 |
-
|
686 |
-
# Insérer le CSS
|
687 |
-
final_html = insert_css_into_html(final_html)
|
688 |
-
|
689 |
-
output_filename = os.path.join(job_dir, f"{base_filename}.html")
|
690 |
-
with open(output_filename, 'w', encoding='utf-8') as f:
|
691 |
-
f.write(final_html)
|
692 |
-
|
693 |
-
update_job_status(job_id, 'completed', 'Traitement terminé', result_file=f"{base_filename}.html")
|
694 |
-
delete_files_after_delay([input_file_path], delay=6000)
|
695 |
-
|
696 |
-
except Exception as e:
|
697 |
-
logging.error(f"Erreur lors du traitement du job {job_id}: {str(e)}")
|
698 |
-
update_job_status(job_id, 'error', f"Erreur: {str(e)}")
|
699 |
|
700 |
|
701 |
@app.get("/status/{job_id}")
|
|
|
355 |
if image_extension in ['png', 'jpeg', 'jpg', 'gif']:
|
356 |
mime_type = f'image/{image_extension}'
|
357 |
else:
|
358 |
+
mime_type = 'image/png' # Par défaut si l'extension est inconnue
|
359 |
data_uri = f'data:{mime_type};base64,{image_base64}'
|
360 |
+
# Ajouter une balise img avec le data URI ; inclure un alt si disponible
|
361 |
+
alt_text = shape.name # Ou une logique différente pour déterminer l'alt text
|
362 |
img_tag = f'<img src="{data_uri}" alt="{alt_text}">'
|
363 |
text_content.append(img_tag)
|
|
|
364 |
return "\n".join(text_content)
|
365 |
|
366 |
+
|
367 |
def convert_doc_to_text(input_filename: str) -> str:
|
368 |
if 'textract' not in globals():
|
369 |
raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")
|
|
|
592 |
logging.error(f"Erreur lors du démarrage du job : {str(e)}")
|
593 |
return JSONResponse(status_code=500, content={"message": f"Erreur lors du démarrage du job : {str(e)}"})
|
594 |
|
595 |
+
async def process_file_accessibility(job_id: str, input_file_path: str, ext: str, original_filename: str):
    """Convert an uploaded file to HTML, describe its images, and save the result.

    Steps, in order: convert the input to HTML (converter chosen by ``ext``),
    pass it through ``clean_html_content`` (which fills ``images_data``),
    request a description for each extracted image concurrently, rewrite the
    HTML with ``rewrite_html_accessible``, reinsert the images, strip
    self-hosted ``<script>`` tags and stray Markdown code fences, insert the
    CSS, and write ``<base name>.html`` into the job directory.

    Progress and failures are reported through ``update_job_status``; this
    coroutine never raises to its caller.

    Args:
        job_id: Job identifier; also the sub-directory name under ``JOBS_DIR``.
        input_file_path: Path of the uploaded file on disk.
        ext: Extension of the uploaded file. It is compared against literals
            such as ``'.pdf'`` and ``'.docx'``, so it is presumably lowercase
            with a leading dot (verify against the caller).
        original_filename: File name supplied with the upload; only its final
            path component (without extension) is used to name the output.
    """
    job_dir = os.path.join(JOBS_DIR, job_id)
    try:
        update_job_status(job_id, 'processing', 'Le fichier est en cours de traitement')
        # Keep only the last path component: the name comes from the upload,
        # and directory parts in it must not steer the write outside job_dir.
        base_filename = os.path.splitext(os.path.basename(original_filename))[0]

        # Convert to HTML; images are not extracted at this stage.
        if ext == '.pdf':
            html_content = pdf_to_html(input_file_path)
        elif ext in ('.ppt', '.pptx'):
            html_content = text_to_html(convert_ppt_to_text(input_file_path))
        elif ext == '.doc':
            html_content = text_to_html(convert_doc_to_text(input_file_path))
        elif ext in ('.html', '.htm'):
            with open(input_file_path, 'r', encoding='utf-8') as f:
                raw_html = f.read()
            try:
                html_content = Document(raw_html).summary()
            except Exception as e:
                # Best effort: if the cleanup step fails, keep the raw HTML.
                logging.error(f"Erreur lors du nettoyage HTML : {str(e)}")
                html_content = raw_html
        else:
            input_format = get_pandoc_format(ext)
            html_content = convert_with_pandoc(input_file_path, input_format)

        if ext == '.docx':
            logging.debug(
                "DEBUG ACCESSIBILITY (.docx): HTML après conversion Pandoc : %s...",
                html_content[:500],
            )

        # Cleanup and image extraction. The counter is a one-element list and
        # images_data a dict, both handed to clean_html_content.
        image_counter = [1]
        images_data = {}
        logging.debug(
            "DEBUG ACCESSIBILITY: HTML avant clean_html_content : %s...",
            html_content[:500],
        )
        cleaned_html = await clean_html_content(html_content, image_counter, images_data)
        # Log a 500-character preview like the other debug lines, not the
        # whole document.
        logging.debug(
            "DEBUG ACCESSIBILITY: HTML après clean_html_content : %s...",
            cleaned_html[:500],
        )
        logging.debug(
            "DEBUG ACCESSIBILITY: images_data après clean_html_content : %s",
            images_data,
        )

        # Build one description prompt per image from its surrounding context.
        for image_key in images_data:
            context = get_context_for_image(cleaned_html, image_key)
            prompt = (
                "Décris ce que l'on peut voir sur cette image, pour qu'un lecteur malvoyant puisse comprendre ce qu'elle représente."
                "\nJe vais maintenant te donner les dernières phrases qui précèdent cette image. "
                "Prends en compte ce contexte pour l'interpréter :\n\"" + context + "\""
            )
            images_data[image_key]['prompt'] = prompt

        # Request all image descriptions concurrently; gather returns results
        # in the order the awaitables were passed, which matches image_keys.
        image_keys = list(images_data)
        descriptions = await asyncio.gather(
            *(
                get_image_description(
                    images_data[key]['base64_image'],
                    images_data[key]['prompt'],
                )
                for key in image_keys
            )
        )
        for key, description in zip(image_keys, descriptions):
            images_data[key]['description'] = description

        # Accessible rewrite, then put the images back.
        rewritten_html = await rewrite_html_accessible(cleaned_html)
        final_html = reinsert_images(rewritten_html, images_data)

        # Final cleanup: drop <script> tags whose src points at this Space.
        final_soup = BeautifulSoup(final_html, 'html.parser')
        for script in final_soup.find_all('script', src=True):
            if script['src'].startswith('https://bentham-converttohtml.hf.space/'):
                script.decompose()
        final_html = str(final_soup)

        # Remove lines that consist only of a Markdown code fence.
        final_html = re.sub(r'^\s*```(?:html)?\s*$', '', final_html, flags=re.MULTILINE)

        # Insert the CSS.
        final_html = insert_css_into_html(final_html)

        result_name = f"{base_filename}.html"
        output_filename = os.path.join(job_dir, result_name)
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(final_html)

        update_job_status(job_id, 'completed', 'Traitement terminé', result_file=result_name)
        delete_files_after_delay([input_file_path], delay=6000)

    except Exception as e:
        # Top-level boundary of the job: log with the traceback and surface
        # the error through the job status instead of raising.
        logging.exception("Erreur lors du traitement du job %s: %s", job_id, e)
        update_job_status(job_id, 'error', f"Erreur: {str(e)}")
|
685 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
686 |
|
687 |
|
688 |
@app.get("/status/{job_id}")
|