Spaces:
Sleeping
Sleeping
Nouvelle version : description d'image dans les imports simples
Browse files
main.py
CHANGED
@@ -26,6 +26,7 @@ from bs4 import BeautifulSoup, Comment
|
|
26 |
|
27 |
try:
|
28 |
from pptx import Presentation
|
|
|
29 |
except ImportError:
|
30 |
pass
|
31 |
|
@@ -147,12 +148,11 @@ async def convert_to_accessible_html(input_filename, ext, base_filename, image_c
|
|
147 |
if ext == '.pdf':
|
148 |
# PDF -> HTML avec pages
|
149 |
html_content = pdf_to_html(input_filename)
|
150 |
-
# Pour le PDF, on a déjà des <!--PAGE_X--> par page
|
151 |
elif ext in ['.ppt', '.pptx']:
|
152 |
# PPT/PPTX -> texte -> HTML minimal
|
153 |
text = convert_ppt_to_text(input_filename)
|
154 |
html_content = text_to_html(text)
|
155 |
-
# Ajouter les <!--PAGE_X--> toutes les 20 lignes pour ce format
|
156 |
html_content = insert_page_comments_every_20_paragraphs(html_content)
|
157 |
elif ext == '.doc':
|
158 |
# DOC -> texte (textract) -> HTML minimal
|
@@ -166,14 +166,12 @@ async def convert_to_accessible_html(input_filename, ext, base_filename, image_c
|
|
166 |
doc = Document(html_content)
|
167 |
html_content = doc.summary()
|
168 |
except Exception as e:
|
169 |
-
logging.error(f"Erreur lors du nettoyage HTML : {str(e)}")
|
170 |
-
# Ajouter les <!--PAGE_X--> toutes les 20 lignes
|
171 |
html_content = insert_page_comments_every_20_paragraphs(html_content)
|
172 |
else:
|
173 |
# Formats gérés par Pandoc
|
174 |
input_format = get_pandoc_format(ext)
|
175 |
html_content = convert_with_pandoc(input_filename, input_format)
|
176 |
-
# Ajouter les <!--PAGE_X--> toutes les 20 lignes
|
177 |
html_content = insert_page_comments_every_20_paragraphs(html_content)
|
178 |
|
179 |
# Nettoyage
|
@@ -183,9 +181,19 @@ async def convert_to_accessible_html(input_filename, ext, base_filename, image_c
|
|
183 |
html_rewrite_task = asyncio.create_task(rewrite_html_accessible(cleaned_html))
|
184 |
|
185 |
# Traitement des images (description)
|
|
|
186 |
for image_key in images_data:
|
187 |
base64_image = images_data[image_key]['base64_image']
|
188 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
189 |
images_data[image_key]['description'] = description
|
190 |
|
191 |
await html_rewrite_task
|
@@ -215,13 +223,11 @@ async def convert_to_accessible_html(input_filename, ext, base_filename, image_c
|
|
215 |
return None
|
216 |
|
217 |
def insert_page_comments_every_20_paragraphs(html_content: str) -> str:
|
218 |
-
# Insère un commentaire <!--PAGE_X--> toutes les 20 balises <p>
|
219 |
soup = BeautifulSoup(html_content, 'html.parser')
|
220 |
paragraphs = soup.find_all('p')
|
221 |
page_number = 1
|
222 |
-
count = 0
|
223 |
for i, p in enumerate(paragraphs, start=1):
|
224 |
-
if i % 20 == 1:
|
225 |
comment = soup.new_string(f"<!--PAGE_{page_number}-->")
|
226 |
p.insert_before(comment)
|
227 |
page_number += 1
|
@@ -400,7 +406,9 @@ def markdown_to_html(markdown_text: str) -> str:
|
|
400 |
html = re.sub(r'_(.*?)_', r'<i>\1</i>', html)
|
401 |
return html
|
402 |
|
403 |
-
|
|
|
|
|
404 |
try:
|
405 |
response = await client.chat.completions.create(
|
406 |
model="gpt-4o-mini",
|
@@ -410,7 +418,7 @@ async def get_image_description(base64_image: str) -> str:
|
|
410 |
"content": [
|
411 |
{
|
412 |
"type": "text",
|
413 |
-
"text":
|
414 |
},
|
415 |
{
|
416 |
"type": "image_url",
|
@@ -427,6 +435,7 @@ async def get_image_description(base64_image: str) -> str:
|
|
427 |
except Exception as e:
|
428 |
logging.error(f"Erreur lors de l'appel à l'API OpenAI : {str(e)}")
|
429 |
return "Description indisponible."
|
|
|
430 |
|
431 |
async def rewrite_html_accessible(html_content: str) -> str:
|
432 |
prompt = (
|
@@ -469,7 +478,7 @@ async def rewrite_html_accessible(html_content: str) -> str:
|
|
469 |
response = await client.chat.completions.create(
|
470 |
model="o1-mini",
|
471 |
messages=[
|
472 |
-
{"role": "user", "content": prompt}
|
473 |
],
|
474 |
)
|
475 |
rewritten_html = response.choices[0].message.content.strip()
|
@@ -713,6 +722,49 @@ def delete_temp_files(file_paths: list):
|
|
713 |
except Exception as e:
|
714 |
logging.error(f"Erreur lors de la suppression du fichier {file_path} : {str(e)}")
|
715 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
716 |
@app.post("/convert_to_txt/")
|
717 |
async def convert_file_to_txt(
|
718 |
file: UploadFile = File(...),
|
@@ -749,13 +801,15 @@ async def convert_file_to_txt(
|
|
749 |
unique_id = uuid.uuid4().hex
|
750 |
output_filename = os.path.join(tempfile.gettempdir(), f"{base_filename}_{unique_id}.txt")
|
751 |
|
|
|
|
|
|
|
752 |
if ext == '.pdf':
|
753 |
-
text = ""
|
754 |
with fitz.open(input_filename) as doc:
|
755 |
for page in doc:
|
756 |
text += page.get_text()
|
757 |
-
|
758 |
-
|
759 |
elif ext == '.pptx':
|
760 |
if 'Presentation' not in globals():
|
761 |
raise HTTPException(status_code=500, detail="La librairie python-pptx n'est pas installée.")
|
@@ -766,29 +820,60 @@ async def convert_file_to_txt(
|
|
766 |
if hasattr(shape, "text"):
|
767 |
text_content.append(shape.text)
|
768 |
text = "\n".join(text_content)
|
769 |
-
|
770 |
-
f.write(text)
|
771 |
elif ext == '.ppt':
|
772 |
if 'textract' not in globals():
|
773 |
raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")
|
774 |
text = textract.process(input_filename).decode('utf-8', errors='replace')
|
775 |
-
|
776 |
-
f.write(text)
|
777 |
elif ext == '.doc':
|
778 |
if 'textract' not in globals():
|
779 |
raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")
|
780 |
text = textract.process(input_filename).decode('utf-8', errors='replace')
|
781 |
-
|
782 |
-
|
783 |
else:
|
784 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
785 |
|
786 |
if not os.path.exists(output_filename):
|
787 |
logging.error(f"Le fichier {output_filename} n'a pas été généré.")
|
788 |
raise HTTPException(status_code=500, detail="Erreur lors de la conversion.")
|
789 |
|
790 |
temp_files_to_delete = [input_filename, output_filename]
|
791 |
-
if ext in ['.html', '.htm']:
|
792 |
temp_files_to_delete.append(cleaned_input_filename)
|
793 |
background_tasks.add_task(delete_temp_files, temp_files_to_delete)
|
794 |
|
|
|
26 |
|
27 |
try:
|
28 |
from pptx import Presentation
|
29 |
+
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
30 |
except ImportError:
|
31 |
pass
|
32 |
|
|
|
148 |
if ext == '.pdf':
|
149 |
# PDF -> HTML avec pages
|
150 |
html_content = pdf_to_html(input_filename)
|
151 |
+
# Pour le PDF, on a déjà des <!--PAGE_X--> par page
|
152 |
elif ext in ['.ppt', '.pptx']:
|
153 |
# PPT/PPTX -> texte -> HTML minimal
|
154 |
text = convert_ppt_to_text(input_filename)
|
155 |
html_content = text_to_html(text)
|
|
|
156 |
html_content = insert_page_comments_every_20_paragraphs(html_content)
|
157 |
elif ext == '.doc':
|
158 |
# DOC -> texte (textract) -> HTML minimal
|
|
|
166 |
doc = Document(html_content)
|
167 |
html_content = doc.summary()
|
168 |
except Exception as e:
|
169 |
+
logging.error(f"Erreur lors du nettoyage HTML : {str(e)}")
|
|
|
170 |
html_content = insert_page_comments_every_20_paragraphs(html_content)
|
171 |
else:
|
172 |
# Formats gérés par Pandoc
|
173 |
input_format = get_pandoc_format(ext)
|
174 |
html_content = convert_with_pandoc(input_filename, input_format)
|
|
|
175 |
html_content = insert_page_comments_every_20_paragraphs(html_content)
|
176 |
|
177 |
# Nettoyage
|
|
|
181 |
html_rewrite_task = asyncio.create_task(rewrite_html_accessible(cleaned_html))
|
182 |
|
183 |
# Traitement des images (description)
|
184 |
+
tasks = []
|
185 |
for image_key in images_data:
|
186 |
base64_image = images_data[image_key]['base64_image']
|
187 |
+
tasks.append((image_key, asyncio.create_task(
|
188 |
+
get_image_description(
|
189 |
+
base64_image,
|
190 |
+
prompt="Décris ce que l'on peut voir sur cette image, pour qu'un lecteur malvoyant puisse comprendre ce qu'elle représente."
|
191 |
+
)
|
192 |
+
)))
|
193 |
+
|
194 |
+
results = await asyncio.gather(*(t for _, t in tasks))
|
195 |
+
|
196 |
+
for (image_key, _), description in zip(tasks, results):
|
197 |
images_data[image_key]['description'] = description
|
198 |
|
199 |
await html_rewrite_task
|
|
|
223 |
return None
|
224 |
|
225 |
def insert_page_comments_every_20_paragraphs(html_content: str) -> str:
|
|
|
226 |
soup = BeautifulSoup(html_content, 'html.parser')
|
227 |
paragraphs = soup.find_all('p')
|
228 |
page_number = 1
|
|
|
229 |
for i, p in enumerate(paragraphs, start=1):
|
230 |
+
if i % 20 == 1:
|
231 |
comment = soup.new_string(f"<!--PAGE_{page_number}-->")
|
232 |
p.insert_before(comment)
|
233 |
page_number += 1
|
|
|
406 |
html = re.sub(r'_(.*?)_', r'<i>\1</i>', html)
|
407 |
return html
|
408 |
|
409 |
+
# MODIFICATIONS START
|
410 |
+
# On rend le prompt pour get_image_description paramétrable
|
411 |
+
async def get_image_description(base64_image: str, prompt: str) -> str:
|
412 |
try:
|
413 |
response = await client.chat.completions.create(
|
414 |
model="gpt-4o-mini",
|
|
|
418 |
"content": [
|
419 |
{
|
420 |
"type": "text",
|
421 |
+
"text": prompt,
|
422 |
},
|
423 |
{
|
424 |
"type": "image_url",
|
|
|
435 |
except Exception as e:
|
436 |
logging.error(f"Erreur lors de l'appel à l'API OpenAI : {str(e)}")
|
437 |
return "Description indisponible."
|
438 |
+
# MODIFICATIONS END
|
439 |
|
440 |
async def rewrite_html_accessible(html_content: str) -> str:
|
441 |
prompt = (
|
|
|
478 |
response = await client.chat.completions.create(
|
479 |
model="o1-mini",
|
480 |
messages=[
|
481 |
+
{"role": "user", "content": prompt + html_content}
|
482 |
],
|
483 |
)
|
484 |
rewritten_html = response.choices[0].message.content.strip()
|
|
|
722 |
except Exception as e:
|
723 |
logging.error(f"Erreur lors de la suppression du fichier {file_path} : {str(e)}")
|
724 |
|
725 |
+
# MODIFICATIONS START
|
726 |
+
def extract_images_from_pdf(input_filename: str) -> List[bytes]:
|
727 |
+
images = []
|
728 |
+
with fitz.open(input_filename) as doc:
|
729 |
+
smask_xrefs = set() # Servira à lister les xrefs d'images masques
|
730 |
+
main_images = [] # Servira à lister les infos des images principales
|
731 |
+
|
732 |
+
for page in doc:
|
733 |
+
img_list = page.get_images(full=True)
|
734 |
+
for img in img_list:
|
735 |
+
xref, smask, width, height, bpc, colorspace, filters = img[:7]
|
736 |
+
# Si smask != 0, c'est une image principale avec un masque
|
737 |
+
# On ajoute le xref du masque à la liste des images à ignorer
|
738 |
+
if smask != 0:
|
739 |
+
smask_xrefs.add(smask)
|
740 |
+
# On enregistre l'image principale
|
741 |
+
main_images.append((xref, smask))
|
742 |
+
|
743 |
+
# Maintenant on extrait uniquement les images qui ne sont pas des masques
|
744 |
+
for (xref, smask) in main_images:
|
745 |
+
# Si xref est dans smask_xrefs, c'est une image de masque à ignorer
|
746 |
+
if xref in smask_xrefs:
|
747 |
+
continue
|
748 |
+
# Extraire l'image
|
749 |
+
base_image = doc.extract_image(xref)
|
750 |
+
image_bytes = base_image["image"]
|
751 |
+
images.append(image_bytes)
|
752 |
+
return images
|
753 |
+
|
754 |
+
def extract_images_from_ppt(input_filename: str) -> List[bytes]:
|
755 |
+
images = []
|
756 |
+
if 'Presentation' not in globals():
|
757 |
+
return images
|
758 |
+
prs = Presentation(input_filename)
|
759 |
+
for slide in prs.slides:
|
760 |
+
for shape in slide.shapes:
|
761 |
+
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
762 |
+
image = shape.image
|
763 |
+
image_bytes = image.blob
|
764 |
+
images.append(image_bytes)
|
765 |
+
return images
|
766 |
+
# MODIFICATIONS END
|
767 |
+
|
768 |
@app.post("/convert_to_txt/")
|
769 |
async def convert_file_to_txt(
|
770 |
file: UploadFile = File(...),
|
|
|
801 |
unique_id = uuid.uuid4().hex
|
802 |
output_filename = os.path.join(tempfile.gettempdir(), f"{base_filename}_{unique_id}.txt")
|
803 |
|
804 |
+
text = ""
|
805 |
+
# MODIFICATIONS START: Extraction du texte et des images
|
806 |
+
images_data = []
|
807 |
if ext == '.pdf':
|
|
|
808 |
with fitz.open(input_filename) as doc:
|
809 |
for page in doc:
|
810 |
text += page.get_text()
|
811 |
+
# Extraire les images du PDF
|
812 |
+
images = extract_images_from_pdf(input_filename)
|
813 |
elif ext == '.pptx':
|
814 |
if 'Presentation' not in globals():
|
815 |
raise HTTPException(status_code=500, detail="La librairie python-pptx n'est pas installée.")
|
|
|
820 |
if hasattr(shape, "text"):
|
821 |
text_content.append(shape.text)
|
822 |
text = "\n".join(text_content)
|
823 |
+
images = extract_images_from_ppt(input_filename)
|
|
|
824 |
elif ext == '.ppt':
|
825 |
if 'textract' not in globals():
|
826 |
raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")
|
827 |
text = textract.process(input_filename).decode('utf-8', errors='replace')
|
828 |
+
images = extract_images_from_ppt(input_filename)
|
|
|
829 |
elif ext == '.doc':
|
830 |
if 'textract' not in globals():
|
831 |
raise HTTPException(status_code=500, detail="La librairie textract n'est pas installée.")
|
832 |
text = textract.process(input_filename).decode('utf-8', errors='replace')
|
833 |
+
# Pas d'extraction d'images simple pour .doc ici
|
834 |
+
images = []
|
835 |
else:
|
836 |
+
# Autres formats pris en charge par pandoc (sans extraction d'image)
|
837 |
+
pypandoc.convert_file(input_filename, 'plain', outputfile=output_filename)
|
838 |
+
with open(output_filename, "r", encoding="utf-8") as f:
|
839 |
+
text = f.read()
|
840 |
+
images = []
|
841 |
+
|
842 |
+
# Analyse des images
|
843 |
+
# On récupère les descriptions des images
|
844 |
+
# Le prompt demandé : "Cette image est incluse dans un cours. Je voudrais que tu me donnes toutes les informations pertinentes..."
|
845 |
+
if images:
|
846 |
+
image_descriptions = []
|
847 |
+
tasks = []
|
848 |
+
for i, img_bytes in enumerate(images, start=1):
|
849 |
+
base64_image = base64.b64encode(img_bytes).decode('utf-8')
|
850 |
+
tasks.append((i, asyncio.create_task(
|
851 |
+
get_image_description(
|
852 |
+
base64_image,
|
853 |
+
prompt="Cette image est incluse dans un cours. Je voudrais que tu me donnes toutes les informations pertinentes, pour qu'on puisse comprendre ce qu'elle contient sans la voir. Ne commente pas les couleurs, les formes et la disposition. Ne commente pas le fait que tu décris l'image : fais en sorte que l'image puisse être naturellement remplacée par ta description. Si l'image ne contient aucune information, ne renvoie rien du tout."
|
854 |
+
)
|
855 |
+
)))
|
856 |
+
|
857 |
+
results = await asyncio.gather(*(t for _, t in tasks))
|
858 |
+
|
859 |
+
for (i, _), description in zip(tasks, results):
|
860 |
+
image_descriptions.append((i, description))
|
861 |
+
|
862 |
+
# On ajoute les descriptions à la fin du texte
|
863 |
+
text += "\n\n---\n"
|
864 |
+
for num, desc in image_descriptions:
|
865 |
+
text += f"\nImage {num} : {desc}\n"
|
866 |
+
# MODIFICATIONS END
|
867 |
+
|
868 |
+
with open(output_filename, "w", encoding="utf-8") as f:
|
869 |
+
f.write(text)
|
870 |
|
871 |
if not os.path.exists(output_filename):
|
872 |
logging.error(f"Le fichier {output_filename} n'a pas été généré.")
|
873 |
raise HTTPException(status_code=500, detail="Erreur lors de la conversion.")
|
874 |
|
875 |
temp_files_to_delete = [input_filename, output_filename]
|
876 |
+
if ext in ['.html', '.htm'] and 'cleaned_input_filename' in locals():
|
877 |
temp_files_to_delete.append(cleaned_input_filename)
|
878 |
background_tasks.add_task(delete_temp_files, temp_files_to_delete)
|
879 |
|