Spaces:
Running
Running
File size: 7,947 Bytes
b10792b 328456e b10792b 1d4c573 b10792b 1d4c573 328456e 1d4c573 328456e 1d4c573 328456e 1d4c573 328456e 1d4c573 328456e 1d4c573 328456e 1d4c573 |
|
import pymupdf
import tiktoken
import markdown
import re
from io import BytesIO
from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.enums import TA_CENTER
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
def count_tokens(input_string: str) -> int:
    """Return the number of ``cl100k_base`` tokens in *input_string*.

    Special-token markers (for example ``<|endoftext|>``) that happen to
    appear in the text are counted as ordinary text instead of making the
    tokenizer raise ``ValueError``, so arbitrary document text can be
    measured.

    Args:
        input_string: Text to tokenize.

    Returns:
        Length of the token sequence produced by the encoding.
    """
    tokenizer = tiktoken.get_encoding("cl100k_base")
    # ``encode`` rejects text containing special-token strings by default;
    # an empty ``disallowed_special`` turns that check off so such strings
    # are tokenized like any other characters.
    return len(tokenizer.encode(input_string, disallowed_special=()))
def audit_descriptif_pdf(file, max_img_width=500) -> dict:
    """Audit a PDF and extract its content page by page.

    Args:
        file: Binary file-like object; the result of ``file.read()`` is
            handed to PyMuPDF as the document stream.
        max_img_width: Maximum width recorded for an extracted image. For a
            wider image the recorded width is capped to this value and the
            recorded height is scaled by the same ratio; the image bytes
            themselves are not modified.

    Returns:
        A dict with two keys:

        * ``"audit"``: document-wide counters (pages, images, links, tables,
          tokens, words), a ``"key_words"`` list that is left empty here,
          and one ``"page_<n>"`` entry per page holding that page's image,
          link, table, token and word counts (``<n>`` is ``page.number``).
        * ``"content"``: one ``"page_<n>"`` entry per page with the keys
          ``"images"`` (``(bytes, width, height)`` tuples), ``"texte"``
          (page text), ``"liens"`` (``{"uri", "page"}`` dicts) and
          ``"tableaux"`` (one ``to_pandas()`` result per detected table).
    """
    doc_content = {}

    # Open the document as a context manager so it is closed even when a
    # page fails to process; previously the handle was never released.
    with pymupdf.open(stream=file.read()) as document:
        audit_dict_doc = {
            "number_of_pages": len(document),
            "number_of_images": 0,
            "number_of_links": 0,
            "number_of_tables": 0,
            "number_of_tokens": 0,
            "number_of_words": 0,
            "key_words": [],
        }

        for page in document:
            page_key = f"page_{page.number}"

            # Images: keep the raw bytes together with the capped size.
            images = page.get_images()
            page_images = []
            for img in images:
                base_image = document.extract_image(img[0])  # img[0] is the xref
                image_width = base_image["width"]
                image_height = base_image["height"]
                # Cap the width and scale the height by the same ratio.
                if image_width > max_img_width:
                    ratio = max_img_width / image_width
                    image_width = max_img_width
                    image_height = int(image_height * ratio)
                page_images.append(
                    (base_image["image"], image_width, image_height)
                )

            # Links: only URI links that actually carry a "uri" entry.
            links = [
                {"uri": link["uri"], "page": page.number}
                for link in page.get_links()
                if link["kind"] == pymupdf.LINK_URI and "uri" in link
            ]

            # Tables detected on the page, converted with ``to_pandas()``.
            tables = page.find_tables().tables
            page_tables = [tab.to_pandas() for tab in tables]

            # Text, with token and whitespace-separated word counts.
            text = page.get_text("text")

            audit_dict_page = {
                "number_of_images": len(images),
                "number_of_links": len(links),
                "number_of_tables": len(tables),
                "number_of_tokens": count_tokens(text),
                "number_of_words": len(text.split()),
            }

            # Every per-page counter also feeds the document-wide total.
            for counter, value in audit_dict_page.items():
                audit_dict_doc[counter] += value

            audit_dict_doc[page_key] = audit_dict_page
            doc_content[page_key] = {
                "images": page_images,
                "texte": text,
                "liens": links,
                "tableaux": page_tables,
            }

    return {
        "audit": audit_dict_doc,
        "content": doc_content,
    }
def markdown_to_html(md_text):
    """Convert Markdown text to HTML for the PDF export.

    Delegates to ``markdown.markdown`` with the ``html`` output format and
    returns its result unchanged.
    """
    rendered = markdown.markdown(md_text, output_format='html')
    return rendered
# NOTE(review): no `def` line is visible for the statements below, and
# `chapter_data` / `generate_pdf_from_html` are not defined in the visible
# code. Presumably this is the body of an HTML-based export helper whose
# header is missing; confirm against the full file.
# Start of the HTML document: inline CSS plus the main title.
html = """
<html>
<head>
<style>
body { font-family: Arial, sans-serif; margin: 40px; }
h1 { text-align: center; color: #333; }
h2 { border-bottom: 2px solid #666; padding-bottom: 5px; margin-top: 30px; }
.message { margin-bottom: 10px; padding: 10px; border-radius: 5px; }
.human { background-color: #e1f5fe; }
.ai { background-color: #e8f5e9; }
.system { background-color: #ffebee; }
</style>
</head>
<body>
<h1>Conseiller augmenté CEGARA</h1>
"""
# Sort the chapters by "num"
chapters = sorted(chapter_data, key=lambda x: x["num"])
for chapter in chapters:
# One <h2> heading per chapter.
html += f"<h2>Chapitre {chapter['num']}: {chapter['title']}</h2>"
# Messages are rendered only when the chapter holds more than one.
if len(chapter["messages"]) > 1 :
# Render the messages
for msg in chapter["messages"]:
# Pick the sender label and CSS class from the message type.
if isinstance(msg, HumanMessage):
sender = "Utilisateur"
css_class = "human"
elif isinstance(msg, AIMessage):
sender = "IA"
css_class = "ai"
elif isinstance(msg, SystemMessage):
sender = "Système"
css_class = "system"
else:
sender = "Message"
css_class = ""
# The message content goes through markdown_to_html before insertion.
html += f"""
<div class="message {css_class}">
<b>{sender} :</b> {markdown_to_html(msg.content)}
</div>
"""
html += "</body></html>"
# NOTE(review): `generate_pdf_from_html` is not visible in this file view.
return generate_pdf_from_html(html)
def markdown_to_reportlab(text):
    """Prepare text for a ReportLab paragraph by converting line breaks.

    Every ``"\\n"`` in *text* becomes a ``<br/>`` tag. Nothing else is
    rewritten: bold, italic, bullet and heading conversions are not applied
    by this function.
    """
    lines = text.split("\n")
    return "<br/>".join(lines)
def generate_pdf(chapter_data: list):
    """Render the chapters and their messages into an in-memory A4 PDF.

    Args:
        chapter_data: Chapter dicts, each read through the keys ``"num"``
            (sort key, also shown in the heading), ``"title"`` and
            ``"messages"``.

    Returns:
        A ``BytesIO`` holding the built PDF, rewound to position 0.

    Each chapter gets a heading. Its messages are listed only when the
    chapter holds more than one; every listed message is preceded by a
    coloured sender label and its content goes through
    ``markdown_to_html`` and then ``markdown_to_reportlab``.
    """
    sample_styles = getSampleStyleSheet()
    title_style = sample_styles["Title"]
    title_style.alignment = TA_CENTER  # centre the main title
    heading_style = sample_styles["Heading2"]
    body_style = sample_styles["BodyText"]

    # (message class, label colour, label text), checked in this order.
    sender_styles = (
        (HumanMessage, "blue", "Utilisateur"),
        (AIMessage, "green", "Conseiller augmenté CEGARA"),
        (SystemMessage, "red", "Système"),
    )

    # Main document title followed by some spacing.
    story = [
        Paragraph("Conseiller augmenté CEGARA", title_style),
        Spacer(1, 20),
    ]

    # Chapters are emitted in ascending "num" order.
    for chapter in sorted(chapter_data, key=lambda item: item["num"]):
        heading = f"Chapitre {chapter['num']}: {chapter['title']}"
        story.append(Paragraph(heading, heading_style))
        story.append(Spacer(1, 10))

        messages = chapter["messages"]
        if len(messages) > 1:
            for msg in messages:
                # First matching class wins; anything else gets the
                # neutral fallback label.
                color, sender = next(
                    (
                        (label_color, label)
                        for cls, label_color, label in sender_styles
                        if isinstance(msg, cls)
                    ),
                    ("black", "Message"),
                )
                story.append(
                    Paragraph(
                        f"<b><font color='{color}'>{sender}</font></b>",
                        body_style,
                    )
                )
                body = markdown_to_reportlab(markdown_to_html(msg.content))
                story.append(Paragraph(body, body_style))
                story.append(Spacer(1, 10))

        # Spacing between chapters.
        story.append(Spacer(1, 15))

    pdf_buffer = BytesIO()
    SimpleDocTemplate(pdf_buffer, pagesize=A4).build(story)
    pdf_buffer.seek(0)
    return pdf_buffer