Spaces:
Running
Running
import pymupdf | |
import tiktoken | |
import markdown | |
import re | |
from io import BytesIO | |
from reportlab.lib.pagesizes import A4 | |
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer | |
from reportlab.lib.styles import getSampleStyleSheet | |
from reportlab.lib.enums import TA_CENTER | |
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage | |
def count_tokens(input_string: str) -> int:
    """Return the number of ``cl100k_base`` tokens in *input_string*.

    Special-token markers that happen to appear in the text (for example
    ``<|endoftext|>``) are counted as ordinary text rather than rejected.

    Args:
        input_string: Text to tokenize.

    Returns:
        The length of the token sequence produced by the encoder.
    """
    tokenizer = tiktoken.get_encoding("cl100k_base")
    # ``encode`` raises ValueError by default when the input contains a
    # special-token marker. This function is fed arbitrary extracted text, so
    # that check is switched off and such markers are tokenized like any
    # other characters.
    tokens = tokenizer.encode(input_string, disallowed_special=())
    return len(tokens)
def audit_descriptif_pdf(file, max_img_width=500) -> dict:
    """Audit a PDF read from a file-like object and extract its per-page content.

    Args:
        file: Object whose ``read()`` result is handed to ``pymupdf.open`` as
            the document stream.
        max_img_width: Largest width recorded for an extracted image. A wider
            image keeps its original bytes, but its recorded width is capped
            at this value and its recorded height is scaled by the same ratio.

    Returns:
        A dict with two entries:

        * ``"audit"``: document-wide counters (``number_of_pages``,
          ``number_of_images``, ``number_of_links``, ``number_of_tables``,
          ``number_of_tokens``, ``number_of_words``), a ``key_words`` list
          that this function leaves empty, and one ``"page_<page.number>"``
          entry per page holding that page's image, link, table, token and
          word counts.
        * ``"content"``: one ``"page_<page.number>"`` entry per page with
          ``"images"`` (list of ``(bytes, width, height)`` tuples),
          ``"texte"`` (the page text), ``"liens"`` (list of
          ``{"uri", "page"}`` dicts) and ``"tableaux"`` (one ``to_pandas()``
          result per detected table).
    """
    # Opening the document as a context manager makes sure it is closed even
    # when extraction fails part-way through.
    with pymupdf.open(stream=file.read()) as document:
        audit_dict_doc = {
            "number_of_pages": len(document),
            "number_of_images": 0,
            "number_of_links": 0,
            "number_of_tables": 0,
            "number_of_tokens": 0,
            "number_of_words": 0,
            "key_words": [],
        }
        doc_content = {}

        for page in document:
            audit_dict_page = {}
            page_content = {
                "images": [],
                "texte": "",
                "liens": [],
                "tableaux": [],
            }

            # Images: count them, then pull out the raw bytes and dimensions.
            images = page.get_images()
            number_images = len(images)
            audit_dict_page["number_of_images"] = number_images
            audit_dict_doc["number_of_images"] += number_images

            for img in images:
                xref = img[0]
                base_image = document.extract_image(xref)
                image_bytes = base_image["image"]
                image_width = base_image["width"]
                image_height = base_image["height"]
                # Only the recorded dimensions are scaled down; the image
                # bytes themselves are stored untouched.
                if image_width > max_img_width:
                    ratio = max_img_width / image_width
                    image_width = max_img_width
                    image_height = int(image_height * ratio)
                page_content["images"].append(
                    (image_bytes, image_width, image_height)
                )

            # Links: keep only URI links that actually carry a "uri" entry.
            links = [
                {"uri": link["uri"], "page": page.number}
                for link in page.get_links()
                if link["kind"] == pymupdf.LINK_URI and "uri" in link
            ]
            page_content["liens"] = links
            number_links = len(links)
            audit_dict_page["number_of_links"] = number_links
            audit_dict_doc["number_of_links"] += number_links

            # Tables: count them and convert each one with to_pandas().
            tables = page.find_tables().tables
            number_tables = len(tables)
            page_content["tableaux"] = [tab.to_pandas() for tab in tables]
            audit_dict_page["number_of_tables"] = number_tables
            audit_dict_doc["number_of_tables"] += number_tables

            # Text: token and whitespace-separated word counts.
            text = page.get_text("text")
            number_tokens = count_tokens(text)
            number_words = len(text.split())
            audit_dict_page["number_of_tokens"] = number_tokens
            audit_dict_page["number_of_words"] = number_words
            page_content["texte"] = text
            audit_dict_doc["number_of_tokens"] += number_tokens
            audit_dict_doc["number_of_words"] += number_words

            # Per-page results sit next to the document-wide counters.
            audit_dict_doc[f"page_{page.number}"] = audit_dict_page
            doc_content[f"page_{page.number}"] = page_content

    # Bundle the audit figures and the extracted content together.
    global_audit = {
        "audit": audit_dict_doc,
        "content": doc_content,
    }
    return global_audit
# Convert Markdown to HTML for the PDF
def markdown_to_html(md_text):
    """Render *md_text* through ``markdown.markdown`` with the ``'html'`` output format.

    Args:
        md_text: Markdown source text.

    Returns:
        Whatever ``markdown.markdown`` returns for that input.
    """
    rendered = markdown.markdown(md_text, output_format='html')
    return rendered
html = """ | |
<html> | |
<head> | |
<style> | |
body { font-family: Arial, sans-serif; margin: 40px; } | |
h1 { text-align: center; color: #333; } | |
h2 { border-bottom: 2px solid #666; padding-bottom: 5px; margin-top: 30px; } | |
.message { margin-bottom: 10px; padding: 10px; border-radius: 5px; } | |
.human { background-color: #e1f5fe; } | |
.ai { background-color: #e8f5e9; } | |
.system { background-color: #ffebee; } | |
</style> | |
</head> | |
<body> | |
<h1>Conseiller augmenté CEGARA</h1> | |
""" | |
# Trier les chapitres par "num" | |
chapters = sorted(chapter_data, key=lambda x: x["num"]) | |
for chapter in chapters: | |
html += f"<h2>Chapitre {chapter['num']}: {chapter['title']}</h2>" | |
if len(chapter["messages"]) > 1 : | |
# Affichage des messages | |
for msg in chapter["messages"]: | |
if isinstance(msg, HumanMessage): | |
sender = "Utilisateur" | |
css_class = "human" | |
elif isinstance(msg, AIMessage): | |
sender = "IA" | |
css_class = "ai" | |
elif isinstance(msg, SystemMessage): | |
sender = "Système" | |
css_class = "system" | |
else: | |
sender = "Message" | |
css_class = "" | |
html += f""" | |
<div class="message {css_class}"> | |
<b>{sender} :</b> {markdown_to_html(msg.content)} | |
</div> | |
""" | |
html += "</body></html>" | |
return generate_pdf_from_html(html) | |
# Convert text into ReportLab-compatible markup
def markdown_to_reportlab(text):
    """Return *text* with every newline character replaced by ``<br/>``.

    Only line breaks are converted. The conversions for bold, italics,
    bullet lists and headings that were commented out here were inactive
    and are not applied.

    Args:
        text: String to convert.

    Returns:
        The same string with each ``"\\n"`` turned into ``"<br/>"``.
    """
    line_break = "<br/>"
    pieces = text.split("\n")
    return line_break.join(pieces)
def generate_pdf(chapter_data: list):
    """Build a PDF transcript of the chapters and return it as an in-memory buffer.

    Args:
        chapter_data: Chapters, each indexable by ``"num"`` (sort key),
            ``"title"`` and ``"messages"``; every message exposes ``content``.

    Returns:
        A ``BytesIO`` containing the built document, rewound to position 0.
    """
    pdf_buffer = BytesIO()
    document = SimpleDocTemplate(pdf_buffer, pagesize=A4)

    stylesheet = getSampleStyleSheet()
    title_style = stylesheet["Title"]
    title_style.alignment = TA_CENTER  # centre the main title
    heading_style = stylesheet["Heading2"]
    body_style = stylesheet["BodyText"]

    # (message class, label colour, label text), checked in this order;
    # anything matching none of them falls back to a black "Message" label.
    sender_styles = (
        (HumanMessage, "blue", "Utilisateur"),
        (AIMessage, "green", "Conseiller augmenté CEGARA"),
        (SystemMessage, "red", "Système"),
    )

    # Main document title followed by a gap.
    story = [
        Paragraph("Conseiller augmenté CEGARA", title_style),
        Spacer(1, 20),
    ]

    # Chapters are emitted in ascending "num" order.
    for chapter in sorted(chapter_data, key=lambda item: item["num"]):
        heading = f"Chapitre {chapter['num']}: {chapter['title']}"
        story.append(Paragraph(heading, heading_style))
        story.append(Spacer(1, 10))

        messages = chapter["messages"]
        # A chapter holding zero or one message gets its heading only.
        # NOTE(review): presumably the lone message is a seed/system prompt
        # -- confirm against the caller.
        if len(messages) > 1:
            for msg in messages:
                color, sender = "black", "Message"
                for message_type, type_color, type_sender in sender_styles:
                    if isinstance(msg, message_type):
                        color, sender = type_color, type_sender
                        break

                label = f"<b><font color='{color}'>{sender}</font></b>"
                story.append(Paragraph(label, body_style))

                # Markdown -> HTML, then newlines -> <br/> for Paragraph.
                body_markup = markdown_to_reportlab(markdown_to_html(msg.content))
                story.append(Paragraph(body_markup, body_style))
                story.append(Spacer(1, 10))

        story.append(Spacer(1, 15))  # gap between chapters

    document.build(story)
    pdf_buffer.seek(0)
    return pdf_buffer