Spaces:
Running
Running
File size: 7,947 Bytes
b10792b 328456e b10792b 1d4c573 b10792b 1d4c573 328456e 1d4c573 328456e 1d4c573 328456e 1d4c573 328456e 1d4c573 328456e 1d4c573 328456e 1d4c573 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 |
import pymupdf
import tiktoken
import markdown
import re
from io import BytesIO
from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.enums import TA_CENTER
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
def count_tokens(input_string: str) -> int:
    """Return how many ``cl100k_base`` tokens ``input_string`` encodes to.

    Args:
        input_string: Text to measure.

    Returns:
        Length of the token sequence produced by tiktoken's
        ``cl100k_base`` encoding for the given text.
    """
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(input_string))
def audit_descriptif_pdf(file, max_img_width=500) -> dict:
    """Audit a PDF and extract its content page by page.

    Args:
        file: Binary file-like object. The result of ``file.read()`` is
            passed to PyMuPDF as the document stream.
        max_img_width: Upper bound for the *recorded* width of an extracted
            image. When an image is wider, the recorded width is clamped to
            this value and the recorded height is scaled by the same ratio.
            The image bytes themselves are returned unmodified.

    Returns:
        A dict with two entries:

        * ``"audit"``: document-wide counters (``number_of_pages``,
          ``number_of_images``, ``number_of_links``, ``number_of_tables``,
          ``number_of_tokens``, ``number_of_words``), a ``key_words`` list
          that this function leaves empty, and one ``"page_<n>"`` entry per
          page (``n`` is ``page.number``) holding that page's counters.
        * ``"content"``: one ``"page_<n>"`` entry per page with
          ``"images"`` (list of ``(bytes, width, height)`` tuples),
          ``"texte"`` (plain text), ``"liens"`` (list of
          ``{"uri", "page"}`` dicts) and ``"tableaux"`` (list of pandas
          DataFrames).
    """
    # The context manager closes the document once extraction is done.
    # Everything returned below (bytes, str, plain dicts, DataFrames) is
    # independent of the open document, so closing it is safe.
    with pymupdf.open(stream=file.read()) as document:
        audit_dict_doc = {
            "number_of_pages": len(document),
            "number_of_images": 0,
            "number_of_links": 0,
            "number_of_tables": 0,
            "number_of_tokens": 0,
            "number_of_words": 0,
            "key_words": [],
        }
        doc_content = {}

        for page in document:
            page_key = f"page_{page.number}"

            # Images: raw bytes plus display dimensions capped at
            # max_img_width (aspect ratio preserved).
            images = page.get_images()
            page_images = []
            for img in images:
                xref = img[0]  # first field of the image entry is its xref
                base_image = document.extract_image(xref)
                image_width = base_image["width"]
                image_height = base_image["height"]
                if image_width > max_img_width:
                    ratio = max_img_width / image_width
                    image_width = max_img_width
                    image_height = int(image_height * ratio)
                page_images.append(
                    (base_image["image"], image_width, image_height)
                )

            # Links: only external URI links are kept.
            links = [
                {"uri": link["uri"], "page": page.number}
                for link in page.get_links()
                if link["kind"] == pymupdf.LINK_URI and "uri" in link
            ]

            # Tables, converted to pandas DataFrames.
            tables = page.find_tables().tables
            page_tables = [tab.to_pandas() for tab in tables]

            # Text, with token and whitespace-separated word counts.
            text = page.get_text("text")
            number_tokens = count_tokens(text)
            number_words = len(text.split())

            audit_dict_page = {
                "number_of_images": len(images),
                "number_of_links": len(links),
                "number_of_tables": len(tables),
                "number_of_tokens": number_tokens,
                "number_of_words": number_words,
            }

            # Roll the page counters up into the document totals.
            for counter, value in audit_dict_page.items():
                audit_dict_doc[counter] += value

            audit_dict_doc[page_key] = audit_dict_page
            doc_content[page_key] = {
                "images": page_images,
                "texte": text,
                "liens": links,
                "tableaux": page_tables,
            }

    return {
        "audit": audit_dict_doc,
        "content": doc_content,
    }
# Convert Markdown to HTML for the PDF export
def markdown_to_html(md_text):
    """Render Markdown text as an HTML fragment.

    Args:
        md_text: Markdown source text.

    Returns:
        The string produced by ``markdown.markdown`` with
        ``output_format='html'``.
    """
    return markdown.markdown(md_text, output_format='html')
# Adapt text for use as ReportLab paragraph markup
def markdown_to_reportlab(text):
    """Turn every newline in ``text`` into a ``<br/>`` line-break tag.

    No other conversion (bold, italics, bullet lists, headings) is applied
    here; only line breaks are rewritten.

    Args:
        text: Input string.

    Returns:
        ``text`` with each ``"\\n"`` replaced by ``"<br/>"``.
    """
    line_break_tag = "<br/>"
    # Splitting on the newline and re-joining with the tag is equivalent to
    # a plain substring replacement.
    return line_break_tag.join(text.split("\n"))
def generate_pdf(chapter_data: list):
    """Build an A4 PDF transcript of the given chapters.

    Args:
        chapter_data: List of chapter dicts. Each one is read through its
            ``"num"`` (sort key and heading number), ``"title"`` (heading
            text) and ``"messages"`` (list of message objects exposing
            ``.content``) entries.

    Returns:
        An ``io.BytesIO`` holding the rendered PDF, rewound to position 0.
    """
    sample_styles = getSampleStyleSheet()
    title_style = sample_styles["Title"]
    title_style.alignment = TA_CENTER  # centre the document title
    heading_style = sample_styles["Heading2"]
    body_style = sample_styles["BodyText"]

    # (message class, label colour, label text), tested in this order;
    # anything else falls back to a black "Message" label.
    sender_styles = (
        (HumanMessage, "blue", "Utilisateur"),
        (AIMessage, "green", "Conseiller augmenté CEGARA"),
        (SystemMessage, "red", "Système"),
    )
    fallback_style = ("black", "Message")

    # Document title followed by some vertical space.
    story = [
        Paragraph("Conseiller augmenté CEGARA", title_style),
        Spacer(1, 20),
    ]

    # Chapters are rendered in ascending "num" order.
    for chapter in sorted(chapter_data, key=lambda item: item["num"]):
        story.append(
            Paragraph(f"Chapitre {chapter['num']}: {chapter['title']}", heading_style)
        )
        story.append(Spacer(1, 10))

        messages = chapter["messages"]
        # Chapters holding a single message (or none) get a heading only.
        if len(messages) > 1:
            for message in messages:
                color, sender = next(
                    (
                        (label_color, label_text)
                        for msg_class, label_color, label_text in sender_styles
                        if isinstance(message, msg_class)
                    ),
                    fallback_style,
                )
                story.append(
                    Paragraph(f"<b><font color='{color}'>{sender}</font></b>", body_style)
                )
                # Markdown -> HTML, then newlines -> <br/> for ReportLab.
                body_markup = markdown_to_reportlab(markdown_to_html(message.content))
                story.append(Paragraph(body_markup, body_style))
                story.append(Spacer(1, 10))

        story.append(Spacer(1, 15))  # gap between chapters

    pdf_buffer = BytesIO()
    SimpleDocTemplate(pdf_buffer, pagesize=A4).build(story)
    pdf_buffer.seek(0)
    return pdf_buffer