ConseillerAugmente / utils /document.py
feat: Add Markdown to HTML conversion for PDF generation
328456e
raw
history blame
7.95 kB
import pymupdf
import tiktoken
import markdown
import re
from io import BytesIO
from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.enums import TA_CENTER
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
def count_tokens(input_string: str) -> int:
    """Return the number of ``cl100k_base`` tokens in *input_string*.

    Special-token markers that happen to appear in the text (for example
    ``<|endoftext|>``) are counted as ordinary text. With tiktoken's default
    settings ``encode`` raises ``ValueError`` on such input, which would abort
    the processing of any document that merely mentions one of those markers.

    Args:
        input_string: Text to tokenize.

    Returns:
        The token count of the text.
    """
    tokenizer = tiktoken.get_encoding("cl100k_base")
    # disallowed_special=() disables the special-token check, so arbitrary
    # extracted text can never make the encoder raise.
    return len(tokenizer.encode(input_string, disallowed_special=()))
def audit_descriptif_pdf(file, max_img_width=500) -> dict:
    """Extract the content of a PDF and compute per-page and global statistics.

    Args:
        file: Binary file-like object; the result of ``file.read()`` is handed
            to ``pymupdf.open`` as the document stream.
        max_img_width: Maximum width reported for an image. Wider images get
            their reported width capped to this value and their reported
            height scaled by the same ratio. The image bytes themselves are
            returned unchanged.

    Returns:
        A dict with two keys:

        * ``"audit"``: document-wide totals (``number_of_pages``,
          ``number_of_images``, ``number_of_links``, ``number_of_tables``,
          ``number_of_tokens``, ``number_of_words``), an always-empty
          ``key_words`` list, and one ``page_<n>`` entry per page holding the
          image/link/table/token/word counters of that page.
        * ``"content"``: one ``page_<n>`` entry per page with ``images``
          (``(bytes, width, height)`` tuples), ``texte`` (page text),
          ``liens`` (``{"uri", "page"}`` dicts, URI links only) and
          ``tableaux`` (one ``to_pandas()`` result per detected table).

        ``<n>`` is PyMuPDF's ``page.number``.
    """
    doc_content = {}

    # The context manager closes the document even when extraction fails;
    # everything kept below (bytes, str, table exports) is already extracted
    # before the document is released.
    with pymupdf.open(stream=file.read()) as document:
        audit_dict_doc = {
            "number_of_pages": len(document),
            "number_of_images": 0,
            "number_of_links": 0,
            "number_of_tables": 0,
            "number_of_tokens": 0,
            "number_of_words": 0,
            "key_words": []
        }

        for page in document:
            audit_dict_page = {}
            page_content = {
                "images": [],
                "texte": "",
                "liens": [],
                "tableaux": []
            }

            # Images: count them, then extract bytes and display dimensions
            images = page.get_images()
            number_images = len(images)
            audit_dict_page["number_of_images"] = number_images
            audit_dict_doc["number_of_images"] += number_images

            for img in images:
                # The first item of each image entry is its xref
                base_image = document.extract_image(img[0])
                image_width = base_image["width"]
                image_height = base_image["height"]
                # Cap the width and scale the height by the same ratio
                if image_width > max_img_width:
                    ratio = max_img_width / image_width
                    image_width = max_img_width
                    image_height = int(image_height * ratio)
                page_content["images"].append(
                    (base_image["image"], image_width, image_height)
                )

            # Links: only those of kind LINK_URI that carry a "uri" entry
            links = [
                {"uri": link["uri"], "page": page.number}
                for link in page.get_links()
                if link["kind"] == pymupdf.LINK_URI and "uri" in link
            ]
            page_content["liens"] = links
            number_links = len(links)
            audit_dict_page["number_of_links"] = number_links
            audit_dict_doc["number_of_links"] += number_links

            # Tables: count them and export each one through to_pandas()
            tables = page.find_tables().tables
            number_tables = len(tables)
            page_content["tableaux"] = [tab.to_pandas() for tab in tables]
            audit_dict_page["number_of_tables"] = number_tables
            audit_dict_doc["number_of_tables"] += number_tables

            # Text: raw page text plus its token and word counts
            text = page.get_text("text")
            number_tokens = count_tokens(text)
            number_words = len(text.split())
            audit_dict_page["number_of_tokens"] = number_tokens
            audit_dict_page["number_of_words"] = number_words
            page_content["texte"] = text
            audit_dict_doc["number_of_tokens"] += number_tokens
            audit_dict_doc["number_of_words"] += number_words

            # Per-page results are stored under the same key in both dicts
            page_key = f"page_{page.number}"
            audit_dict_doc[page_key] = audit_dict_page
            doc_content[page_key] = page_content

    return {
        "audit": audit_dict_doc,
        "content": doc_content
    }
def markdown_to_html(md_text):
    """Render Markdown text to HTML for inclusion in the generated PDF.

    Args:
        md_text: Markdown source text.

    Returns:
        The string produced by ``markdown.markdown`` with
        ``output_format='html'``.
    """
    return markdown.markdown(md_text, output_format='html')
# Prepare text for use as ReportLab paragraph content
def markdown_to_reportlab(text):
    """Return *text* with every newline replaced by a ``<br/>`` tag.

    Only line breaks are converted. No other Markdown construct (bold,
    italics, bullet lists, headings) is translated by this function.
    """
    lines = text.split("\n")
    return "<br/>".join(lines)
def generate_pdf(chapter_data: list):
    """Render all chapters and their messages into an in-memory PDF.

    Args:
        chapter_data: List of chapter dicts. Each one is read through its
            ``"num"`` (sort key, also shown in the heading), ``"title"`` and
            ``"messages"`` entries.

    Returns:
        A ``BytesIO`` holding the built PDF, rewound to offset 0.

    Chapters are emitted in ascending ``"num"`` order. A chapter whose
    ``"messages"`` holds one message or none gets its heading only.
    """
    output = BytesIO()
    pdf_doc = SimpleDocTemplate(output, pagesize=A4)

    sheet = getSampleStyleSheet()
    title_style = sheet["Title"]
    title_style.alignment = TA_CENTER  # centre the document title
    heading_style = sheet["Heading2"]
    body_style = sheet["BodyText"]

    # (message class, label colour, label text), checked in this order
    sender_styles = (
        (HumanMessage, "blue", "Utilisateur"),
        (AIMessage, "green", "Conseiller augmenté CEGARA"),
        (SystemMessage, "red", "Système"),
    )

    # Document title followed by some vertical space
    story = [
        Paragraph("Conseiller augmenté CEGARA", title_style),
        Spacer(1, 20),
    ]

    for chapter in sorted(chapter_data, key=lambda item: item["num"]):
        # Chapter heading
        # NOTE(review): the title is interpolated as paragraph markup without
        # escaping -- confirm titles never contain '<' or '&'.
        story.append(
            Paragraph(f"Chapitre {chapter['num']}: {chapter['title']}", heading_style)
        )
        story.append(Spacer(1, 10))

        messages = chapter["messages"]
        if len(messages) > 1:
            for msg in messages:
                # First matching class wins; anything else gets the fallback
                color, sender = next(
                    (
                        (label_color, label)
                        for msg_type, label_color, label in sender_styles
                        if isinstance(msg, msg_type)
                    ),
                    ("black", "Message"),
                )
                story.append(
                    Paragraph(f"<b><font color='{color}'>{sender}</font></b>", body_style)
                )
                # Markdown -> HTML, then newlines -> <br/>
                body = markdown_to_reportlab(markdown_to_html(msg.content))
                story.append(Paragraph(body, body_style))
                story.append(Spacer(1, 10))

        # Extra space between chapters
        story.append(Spacer(1, 15))

    pdf_doc.build(story)
    output.seek(0)
    return output