Update main.py
main.py CHANGED
@@ -26,7 +26,6 @@ from bs4 import BeautifulSoup, Comment
 
 try:
     from pptx import Presentation
-    from pptx.enum.shapes import MSO_SHAPE_TYPE
 except ImportError:
     pass
 
@@ -91,7 +90,7 @@ def get_job_status(job_id: str):
         status_data = json.load(f)
     return status_data
 
-async def process_file(job_id: str, input_file_path: str, ext: str, original_filename: str):
+def process_file(job_id: str, input_file_path: str, ext: str, original_filename: str):
     job_dir = os.path.join(JOBS_DIR, job_id)
     try:
         update_job_status(job_id, 'processing', 'Le fichier est en cours de traitement')
@@ -101,9 +100,16 @@ async def process_file(job_id: str, input_file_path: str, ext: str, original_fil
     base_filename = os.path.splitext(original_filename)[0]
     output_filename = os.path.join(job_dir, f"{base_filename}.html")
 
-    final_html = await convert_to_accessible_html(
-        input_file_path, ext, base_filename, image_counter, images_data
-    )
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    try:
+        final_html = loop.run_until_complete(
+            convert_to_accessible_html(
+                input_file_path, ext, base_filename, image_counter, images_data
+            )
+        )
+    finally:
+        loop.close()
 
     if not final_html:
         update_job_status(job_id, 'error', 'Erreur lors de la conversion.')
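
Note on this hunk: process_file is now a plain synchronous function, so FastAPI's BackgroundTasks runs it in a worker thread instead of on the server's event loop, and it drives the async conversion through a private loop. A minimal standalone sketch of the same pattern (the coroutine below is a stand-in, not code from main.py):

import asyncio

async def convert() -> str:
    await asyncio.sleep(0.1)  # stand-in for convert_to_accessible_html(...)
    return "<html></html>"

def process_sync() -> str:
    # Create a dedicated loop, run the coroutine to completion,
    # and always close the loop, mirroring the try/finally in the hunk above.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        return loop.run_until_complete(convert())
    finally:
        loop.close()

print(process_sync())  # "<html></html>"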
@@ -139,23 +145,35 @@ def delete_files_after_delay(file_paths: List[str], delay: int = 6000):
 async def convert_to_accessible_html(input_filename, ext, base_filename, image_counter, images_data):
     try:
         if ext == '.pdf':
-            # PDF -> HTML avec pages
+            # PDF -> HTML avec pages
             html_content = pdf_to_html(input_filename)
-            # Pour le PDF, on a déjà des <!--PAGE_X--> par page
+            # Pour le PDF, on a déjà des <!--PAGE_X--> par page, pas besoin d'en ajouter toutes les 20 lignes
         elif ext in ['.ppt', '.pptx']:
             # PPT/PPTX -> texte -> HTML minimal
             text = convert_ppt_to_text(input_filename)
             html_content = text_to_html(text)
+            # Ajouter les <!--PAGE_X--> toutes les 20 lignes pour ce format
             html_content = insert_page_comments_every_20_paragraphs(html_content)
         elif ext == '.doc':
             # DOC -> texte (textract) -> HTML minimal
             text = convert_doc_to_text(input_filename)
             html_content = text_to_html(text)
             html_content = insert_page_comments_every_20_paragraphs(html_content)
+        elif ext in ['.html', '.htm']:
+            with open(input_filename, 'r', encoding='utf-8') as f:
+                html_content = f.read()
+            try:
+                doc = Document(html_content)
+                html_content = doc.summary()
+            except Exception as e:
+                logging.error(f"Erreur lors du nettoyage HTML avec readability-lxml : {str(e)}")
+            # Ajouter les <!--PAGE_X--> toutes les 20 lignes
+            html_content = insert_page_comments_every_20_paragraphs(html_content)
         else:
             # Formats gérés par Pandoc
             input_format = get_pandoc_format(ext)
             html_content = convert_with_pandoc(input_filename, input_format)
+            # Ajouter les <!--PAGE_X--> toutes les 20 lignes
             html_content = insert_page_comments_every_20_paragraphs(html_content)
 
         # Nettoyage
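
The new '.html'/'.htm' branch calls a Document class whose error message points at readability-lxml, so it is presumably readability.Document. A small sketch of that API under this assumption:

from readability import Document  # pip install readability-lxml

raw = "<html><body><nav>menu</nav><article><p>Contenu utile.</p></article></body></html>"
doc = Document(raw)
print(doc.short_title())  # best-guess page title
print(doc.summary())      # main readable content, as an <html> fragment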
@@ -165,24 +183,10 @@ async def convert_to_accessible_html(input_filename, ext, base_filename, image_c
         html_rewrite_task = asyncio.create_task(rewrite_html_accessible(cleaned_html))
 
         # Traitement des images (description)
-        tasks = []
         for image_key in images_data:
             base64_image = images_data[image_key]['base64_image']
-            tasks.append((image_key, asyncio.create_task(
-                get_image_description(
-                    base64_image,
-                    prompt="Décris ce que l'on peut voir sur cette image, pour qu'un lecteur malvoyant puisse comprendre ce qu'elle représente."
-                )
-            )))
-
-        results = await asyncio.gather(*(t for _, t in tasks), return_exceptions=True)
-
-        for (image_key, _), description in zip(tasks, results):
-            if isinstance(description, Exception):
-                logging.error(f"Erreur lors de la description de l'image {image_key} : {str(description)}")
-                images_data[image_key]['description'] = "Description indisponible."
-            else:
-                images_data[image_key]['description'] = description
+            description = await get_image_description(base64_image)
+            images_data[image_key]['description'] = description
 
         await html_rewrite_task
         rewritten_html = html_rewrite_task.result()
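
This hunk trades concurrency for simplicity: descriptions are now awaited one at a time, so N images cost N sequential API round trips, and an exception now propagates out of the loop instead of being handled per image. A sketch of the removed concurrent pattern, should it need restoring:

import asyncio

async def describe_all(images_data, get_image_description):
    # Launch one description task per image, then map results
    # (or exceptions) back onto the corresponding entries.
    keys = list(images_data)
    tasks = [asyncio.create_task(get_image_description(images_data[k]['base64_image']))
             for k in keys]
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for key, result in zip(keys, results):
        if isinstance(result, Exception):
            images_data[key]['description'] = "Description indisponible."
        else:
            images_data[key]['description'] = result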
@@ -211,11 +215,13 @@ async def convert_to_accessible_html(input_filename, ext, base_filename, image_c
     return None
 
 def insert_page_comments_every_20_paragraphs(html_content: str) -> str:
+    # Insère un commentaire <!--PAGE_X--> toutes les 20 balises <p>
     soup = BeautifulSoup(html_content, 'html.parser')
     paragraphs = soup.find_all('p')
     page_number = 1
+    count = 0
     for i, p in enumerate(paragraphs, start=1):
-        if i % 20 == 1:
+        if i % 20 == 1:  # Avant le premier <p> d'un "bloc"
             comment = soup.new_string(f"<!--PAGE_{page_number}-->")
             p.insert_before(comment)
             page_number += 1
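
One caveat on this helper: soup.new_string(...) creates a NavigableString, and BeautifulSoup escapes < and > when serializing text nodes, so the marker comes out as &lt;!--PAGE_1--&gt; rather than a real HTML comment. Since the file already imports Comment from bs4, a Comment node would survive serialization intact; a quick comparison:

from bs4 import BeautifulSoup, Comment

soup = BeautifulSoup("<p>texte</p>", "html.parser")
p = soup.p
p.insert_before(soup.new_string("<!--PAGE_1-->"))  # serialized escaped: &lt;!--PAGE_1--&gt;
p.insert_before(Comment("PAGE_2"))                 # serialized as a comment: <!--PAGE_2-->
print(str(soup))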
@@ -223,7 +229,6 @@ def insert_page_comments_every_20_paragraphs(html_content: str) -> str:
 
 def insert_css_into_html(html_content: str) -> str:
     css_code = """
-    /* Votre code CSS ici */
     :root {
         --font-size-min: 1rem;
         --font-size-base: 1rem;
@@ -395,44 +400,38 @@ def markdown_to_html(markdown_text: str) -> str:
     html = re.sub(r'_(.*?)_', r'<i>\1</i>', html)
     return html
 
-
-# On rend le prompt pour get_image_description paramétrable
-async def get_image_description(base64_image: str, prompt: str) -> str:
+async def get_image_description(base64_image: str) -> str:
     try:
-        # Préparer le contenu avec le prompt et l'image en markdown
-        content = f"{prompt}\n\n"
-        logging.debug(f"Contenu envoyé à l'API OpenAI : {content}")
-
         response = await client.chat.completions.create(
             model="gpt-4o-mini",
             messages=[
                 {
                     "role": "user",
-                    "content":
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "Décris ce que l'on peut voir sur cette image, pour qu'un lecteur malvoyant puisse comprendre ce qu'elle représente.",
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{base64_image}"
+                            },
+                        },
+                    ],
                 }
             ],
         )
-        logging.debug(f"Réponse de l'API OpenAI : {response}")
-
-        if not response.choices:
-            logging.error("Aucune réponse reçue de l'API OpenAI.")
-            return "Description indisponible."
-
         description = response.choices[0].message.content.strip()
-        logging.debug(f"Description obtenue : {description}")
         return description
     except Exception as e:
         logging.error(f"Erreur lors de l'appel à l'API OpenAI : {str(e)}")
         return "Description indisponible."
 
-
-
-# MODIFICATIONS END
-
 async def rewrite_html_accessible(html_content: str) -> str:
     prompt = (
         "Je vais te donner un fichier HTML, et je voudrais que tu le réécrives pour permettre l'accessibilité à toutes les formes de handicap, tout en **préservant strictement l'ordre du contenu original**.\n"
-
+        "Commence à analyser le plan du document. Il faut d'abord identifier les titres et comprendre leur logique :\n"
         "- A priori, les titres qui sont préfixés par une écriture romaine (I, II, III), "
         "par un nombre (1, 2, 3) ou par une lettre (a, b, c, ou bien A, B, C) doivent être de même niveau."
         "Idem pour les titres rédigés en majuscules.\n"
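
get_image_description now hard-codes its prompt and passes the image through the Chat Completions image_url content part. A usage sketch, assuming the module-level async OpenAI client used elsewhere in this file; figure.jpg is a placeholder:

import asyncio
import base64

async def main():
    with open("figure.jpg", "rb") as f:  # placeholder input image
        b64 = base64.b64encode(f.read()).decode("utf-8")
    print(await get_image_description(b64))

asyncio.run(main())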
@@ -470,7 +469,7 @@ async def rewrite_html_accessible(html_content: str) -> str:
         response = await client.chat.completions.create(
             model="o1-mini",
             messages=[
-                {"role": "user", "content": prompt
+                {"role": "user", "content": prompt}
             ],
         )
         rewritten_html = response.choices[0].message.content.strip()
@@ -553,7 +552,7 @@ def reinsert_images(html_content: str, images_data: Dict[str, Dict[str, str]]) -
         if image_key in images_data:
             img_tag = soup.new_tag('img')
             img_tag['src'] = f"data:image/jpeg;base64,{images_data[image_key]['base64_image']}"
-            img_tag['alt'] = images_data[image_key]
+            img_tag['alt'] = images_data[image_key]['description']
 
             new_content = soup.new_tag('div')
             new_content.append(img_tag)
@@ -564,19 +563,17 @@ def reinsert_images(html_content: str, images_data: Dict[str, Dict[str, str]]) -
             p_tag.append(strong_tag)
             p_tag.append(" : ")
 
-            y_markdown = images_data[image_key]
+            y_markdown = images_data[image_key]['description']
             y_html = markdown_to_html(y_markdown)
             y_soup = BeautifulSoup(y_html, 'html.parser')
             p_tag.append(y_soup)
 
             new_content.append(p_tag)
             comment.replace_with(new_content)
-            logging.debug(f"Image {image_number} réinsérée avec description.")
         else:
             logging.error(f"Données pour {image_key} non trouvées.")
-    return str(soup)
-
 
+    return str(soup)
 
 def pdf_to_html(input_filename: str) -> str:
     soup = BeautifulSoup("<html><head></head><body></body></html>", 'html.parser')
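
In this hunk, return str(soup) moves below the image loop. If the old return sat inside the loop body, reinsert_images returned after handling the first image comment only; relocated, it serializes the soup once every comment has been processed. A toy illustration of that difference:

def first_only(items):
    out = []
    for x in items:
        out.append(x * 2)
        return out  # leaves the loop on the first iteration

def all_items(items):
    out = []
    for x in items:
        out.append(x * 2)
    return out      # runs the loop to completion

print(first_only([1, 2, 3]))  # [2]
print(all_items([1, 2, 3]))   # [2, 4, 6]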
@@ -676,7 +673,6 @@ async def convert_file_to_html(
     with open(status_file, 'w') as f:
         json.dump(status, f)
 
-    # Ajoutez la tâche asynchrone
     background_tasks.add_task(process_file, job_id, input_file_path, ext, file.filename)
 
     return JSONResponse(content={'job_id': job_id})
@@ -717,110 +713,6 @@ def delete_temp_files(file_paths: list):
     except Exception as e:
         logging.error(f"Erreur lors de la suppression du fichier {file_path} : {str(e)}")
 
-# MODIFICATIONS START
-def extract_images_from_pdf(input_filename: str) -> List[bytes]:
-    images = []
-    with fitz.open(input_filename) as doc:
-        smask_xrefs = set()  # Servira à lister les xrefs d'images masques
-        main_images = []  # Servira à lister les infos des images principales
-
-        for page_num, page in enumerate(doc, start=1):
-            img_list = page.get_images(full=True)
-            logging.debug(f"Page {page_num} contient {len(img_list)} images.")
-            for img_index, img in enumerate(img_list, start=1):
-                if len(img) < 7:
-                    logging.warning(f"Image {img_index} sur la page {page_num} a moins de 7 éléments : {img}")
-                    continue  # Ignorer les images avec une structure inattendue
-
-                xref, smask, width, height, bpc, colorspace, filters = img[:7]
-                # Si smask != 0, c'est une image principale avec un masque
-                # On ajoute le xref du masque à la liste des images à ignorer
-                if smask != 0:
-                    smask_xrefs.add(smask)
-                    logging.debug(f"Image {img_index} sur la page {page_num} a un masque (smask={smask}).")
-                # On enregistre l'image principale
-                main_images.append((xref, smask))
-
-        logging.debug(f"Nombre total d'images principales à extraire : {len(main_images)}")
-        # Maintenant on extrait uniquement les images qui ne sont pas des masques
-        for (xref, smask) in main_images:
-            # Si xref est dans smask_xrefs, c'est une image de masque à ignorer
-            if xref in smask_xrefs:
-                logging.debug(f"Image xref={xref} est un masque, elle sera ignorée.")
-                continue
-            try:
-                base_image = doc.extract_image(xref)
-                image_bytes = base_image["image"]
-                images.append(image_bytes)
-                logging.debug(f"Image xref={xref} extraite avec succès.")
-            except Exception as e:
-                logging.error(f"Erreur lors de l'extraction de l'image xref={xref} : {str(e)}")
-    logging.info(f"Extraction des images terminée. Nombre total d'images extraites : {len(images)}")
-    return images
-
-def extract_text_with_image_markers(input_filename: str) -> Tuple[str, List[Tuple[int, bytes]]]:
-    """
-    Extrait le texte d'un PDF en insérant des marqueurs pour les images.
-
-    Args:
-        input_filename (str): Chemin vers le fichier PDF.
-
-    Returns:
-        Tuple[str, List[Tuple[int, bytes]]]: Le texte extrait avec des marqueurs et une liste d'images extraites.
-    """
-    text = ""
-    images = []
-    with fitz.open(input_filename) as doc:
-        for page_num, page in enumerate(doc, start=1):
-            text += f"<!--PAGE_{page_num}-->\n"
-
-            # Extraction du texte
-            page_text = page.get_text("text")
-            text += page_text + '\n'
-
-            # Extraction des images
-            image_list = page.get_images(full=True)
-            for img in image_list:
-                xref = img[0]
-                try:
-                    base_image = doc.extract_image(xref)
-                    image_bytes = base_image["image"]
-
-                    img_num = len(images) + 1
-                    marker = f"[IMG_{img_num}]"
-                    text += marker + '\n'
-
-                    images.append((img_num, image_bytes))
-                    logging.debug(f"Image {img_num} extraite de la page {page_num}.")
-                except Exception as e:
-                    logging.error(f"Erreur lors de l'extraction de l'image xref={xref} sur la page {page_num} : {str(e)}")
-
-            logging.debug(f"Page {page_num}: {len(images)} images extraites jusqu'à présent.")
-
-    logging.debug(f"Total text length: {len(text)} caractères.")
-    logging.debug(f"Total images extraites: {len(images)}.")
-    return text, images
-
-
-
-def extract_images_from_ppt(input_filename: str) -> List[Tuple[int, bytes]]:
-    images = []
-    if 'Presentation' not in globals():
-        return images
-    prs = Presentation(input_filename)
-    img_num = 1  # Compteur pour numéroter les images
-    for slide in prs.slides:
-        for shape in slide.shapes:
-            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
-                image = shape.image
-                image_bytes = image.blob
-                images.append((img_num, image_bytes))
-                img_num += 1
-    return images
-
-
-# MODIFICATIONS END
-
 @app.post("/convert_to_txt/")
 async def convert_file_to_txt(
     file: UploadFile = File(...),
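
The deleted extract_images_from_pdf encoded a detail worth remembering: page.get_images(full=True) also lists soft masks (smask), which generally should not be extracted as standalone images. A condensed sketch of that filtering idea with PyMuPDF, in case it is needed again (input.pdf is a placeholder):

import fitz  # PyMuPDF

def main_pdf_images(path: str) -> list:
    images = []
    with fitz.open(path) as doc:
        mask_xrefs = set()
        xrefs = []
        for page in doc:
            for img in page.get_images(full=True):
                xref, smask = img[0], img[1]
                if smask != 0:
                    mask_xrefs.add(smask)  # remember mask xrefs so they can be skipped
                xrefs.append(xref)
        for xref in xrefs:
            if xref not in mask_xrefs:
                images.append(doc.extract_image(xref)["image"])
    return images

print(len(main_pdf_images("input.pdf")))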
@@ -857,14 +749,13 @@ async def convert_file_to_txt(
     unique_id = uuid.uuid4().hex
     output_filename = os.path.join(tempfile.gettempdir(), f"{base_filename}_{unique_id}.txt")
 
-    text = ""
-    images = []
-    image_descriptions = {}
-
-    # Extraction du texte et des images
     if ext == '.pdf':
-        text, images = extract_text_with_image_markers(input_filename)
-
+        text = ""
+        with fitz.open(input_filename) as doc:
+            for page in doc:
+                text += page.get_text()
+        with open(output_filename, "w", encoding="utf-8") as f:
+            f.write(text)
     elif ext == '.pptx':
         if 'Presentation' not in globals():
             raise HTTPException(status_code=500, detail="La librairie python-pptx n'est pas installée.")
@@ -875,74 +766,29 @@ async def convert_file_to_txt(
                 if hasattr(shape, "text"):
                     text_content.append(shape.text)
             text = "\n".join(text_content)
-
-
+            with open(output_filename, "w", encoding="utf-8") as f:
+                f.write(text)
         elif ext == '.ppt':
             if 'textract' not in globals():
                 raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")
             text = textract.process(input_filename).decode('utf-8', errors='replace')
-
-
+            with open(output_filename, "w", encoding="utf-8") as f:
+                f.write(text)
         elif ext == '.doc':
             if 'textract' not in globals():
                 raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")
             text = textract.process(input_filename).decode('utf-8', errors='replace')
-
-
-            logging.debug(f"Extraction DOC terminée. Texte extrait de {len(text)} caractères. Aucune image trouvée.")
-        else:
-            # Autres formats pris en charge par pandoc (sans extraction d'image)
-            pypandoc.convert_file(input_filename, 'plain', outputfile=output_filename)
-            with open(output_filename, "r", encoding="utf-8") as f:
-                text = f.read()
-            images = []
-            logging.debug(f"Conversion avec Pandoc terminée. Texte extrait de {len(text)} caractères. Aucune image trouvée.")
-
-        # Analyse des images et récupération des descriptions
-        if images:
-            tasks = []
-            for img_num, img_bytes in images:
-                base64_image = base64.b64encode(img_bytes).decode('utf-8')
-                tasks.append(asyncio.create_task(
-                    get_image_description(
-                        base64_image,
-                        prompt="Cette image est incluse dans un cours. Je voudrais que tu me donnes toutes les informations pertinentes, pour qu'on puisse comprendre ce qu'elle contient sans la voir. Ne commente pas les couleurs, les formes et la disposition. Ne commente pas le fait que tu décris l'image : fais en sorte que l'image puisse être naturellement remplacée par ta description. Si l'image ne contient aucune information, ne renvoie rien du tout."
-                    )
-                ))
-            logging.debug(f"Lancement de {len(tasks)} tâches pour la description des images.")
-
-            descriptions = await asyncio.gather(*tasks, return_exceptions=True)
-
-            for (img_num, _), desc in zip(images, descriptions):
-                if isinstance(desc, Exception):
-                    logging.error(f"Erreur lors de la description de l'image {img_num} : {str(desc)}")
-                    image_descriptions[img_num] = "Description indisponible."
-                elif desc and desc != "Description indisponible.":
-                    image_descriptions[img_num] = desc
-                else:
-                    image_descriptions[img_num] = "Description indisponible."
-            logging.debug(f"Descriptions des images terminées. {len(image_descriptions)} descriptions générées.")
-
-            # Remplacer les marqueurs par les descriptions
-            for img_num, desc in image_descriptions.items():
-                marker = f"[IMG_{img_num}]"
-                description_text = f"Image {img_num}: {desc}"
-                text = text.replace(marker, description_text)
-            logging.debug("Remplacement des marqueurs d'images par les descriptions terminé.")
+            with open(output_filename, "w", encoding="utf-8") as f:
+                f.write(text)
         else:
-
-
-        # Écriture du texte dans le fichier de sortie
-        with open(output_filename, "w", encoding="utf-8") as f:
-            f.write(text)
-        logging.debug(f"Écriture du fichier texte terminée : {output_filename}")
+            output = pypandoc.convert_file(input_filename, 'plain', outputfile=output_filename)
 
         if not os.path.exists(output_filename):
             logging.error(f"Le fichier {output_filename} n'a pas été généré.")
             raise HTTPException(status_code=500, detail="Erreur lors de la conversion.")
 
         temp_files_to_delete = [input_filename, output_filename]
-        if ext in ['.html', '.htm']
+        if ext in ['.html', '.htm']:
             temp_files_to_delete.append(cleaned_input_filename)
         background_tasks.add_task(delete_temp_files, temp_files_to_delete)
 
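
After this change, /convert_to_txt/ writes plain text directly for each format and no longer extracts or describes images. A hypothetical client call, assuming the app is served locally on port 7860 (both the URL and cours.pdf are placeholders):

import requests

with open("cours.pdf", "rb") as f:  # placeholder document
    resp = requests.post(
        "http://localhost:7860/convert_to_txt/",  # assumed local URL
        files={"file": ("cours.pdf", f, "application/pdf")},
    )
print(resp.status_code)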
@@ -953,5 +799,4 @@ async def convert_file_to_txt(
         return JSONResponse(status_code=http_exc.status_code, content={"message": http_exc.detail})
     except Exception as e:
         logging.error(f"Erreur interne lors de la conversion : {str(e)}")
-        return JSONResponse(status_code=500, content={"message": f"Erreur interne : {str(e)}"})
-
+        return JSONResponse(status_code=500, content={"message": f"Erreur interne : {str(e)}"})