import re
from io import BytesIO

import markdown
import pymupdf
import tiktoken
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from reportlab.lib.enums import TA_CENTER
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer


def count_tokens(input_string: str) -> int:
    """Return the number of ``cl100k_base`` tokens in *input_string*."""
    tokenizer = tiktoken.get_encoding("cl100k_base")
    # Text extracted from a document may literally contain a special-token
    # marker such as "<|endoftext|>"; encode it as ordinary text instead of
    # letting tiktoken raise ValueError.
    return len(tokenizer.encode(input_string, disallowed_special=()))


def _extract_page_images(document, page, max_img_width):
    """Return one ``(image_bytes, width, height)`` tuple per image of *page*.

    Only the reported dimensions are scaled down when the width exceeds
    *max_img_width* (aspect ratio kept); the image bytes are left untouched.
    """
    extracted = []
    for img in page.get_images():
        xref = img[0]
        base_image = document.extract_image(xref)
        width = base_image["width"]
        height = base_image["height"]
        if width > max_img_width:
            ratio = max_img_width / width
            width = max_img_width
            height = int(height * ratio)
        extracted.append((base_image["image"], width, height))
    return extracted


def _extract_page_links(page):
    """Return ``{"uri", "page"}`` dicts for the URI links found on *page*."""
    return [
        {"uri": link["uri"], "page": page.number}
        for link in page.get_links()
        if link["kind"] == pymupdf.LINK_URI and "uri" in link
    ]


def audit_descriptif_pdf(file, max_img_width=500) -> dict:
    """Collect per-page and document-wide statistics and content from a PDF.

    Args:
        file: Object whose ``read()`` returns the document bytes.
        max_img_width: Maximum width reported for an extracted image; wider
            images get their reported width/height scaled down.

    Returns:
        A dict with two keys:

        * ``"audit"``: document totals (``number_of_pages``,
          ``number_of_images``, ``number_of_links``, ``number_of_tables``,
          ``number_of_tokens``, ``number_of_words``, an empty ``key_words``
          list) plus one ``"page_<n>"`` entry holding the same counters for
          that page, where ``<n>`` is ``page.number``.
        * ``"content"``: one ``"page_<n>"`` entry per page with the keys
          ``"images"``, ``"texte"``, ``"liens"`` and ``"tableaux"`` (tables
          converted with ``to_pandas()``).
    """
    doc_content = {}
    # The context manager closes the document once everything needed has been
    # copied out of it (the original never released it).
    with pymupdf.open(stream=file.read()) as document:
        audit_dict_doc = {
            "number_of_pages": len(document),
            "number_of_images": 0,
            "number_of_links": 0,
            "number_of_tables": 0,
            "number_of_tokens": 0,
            "number_of_words": 0,
            "key_words": [],
        }

        for page in document:
            page_key = f"page_{page.number}"

            images = _extract_page_images(document, page, max_img_width)
            links = _extract_page_links(page)
            tables = page.find_tables().tables
            text = page.get_text("text")

            audit_dict_page = {
                "number_of_images": len(images),
                "number_of_links": len(links),
                "number_of_tables": len(tables),
                "number_of_tokens": count_tokens(text),
                "number_of_words": len(text.split()),
            }
            # Every per-page counter has a same-named document-wide total.
            for counter, value in audit_dict_page.items():
                audit_dict_doc[counter] += value

            audit_dict_doc[page_key] = audit_dict_page
            doc_content[page_key] = {
                "images": images,
                "texte": text,
                "liens": links,
                "tableaux": [tab.to_pandas() for tab in tables],
            }

    return {
        "audit": audit_dict_doc,
        "content": doc_content,
    }


def markdown_to_html(md_text):
    """Convert Markdown text to HTML so it can be placed in the PDF."""
    return markdown.markdown(md_text, output_format='html')


# NOTE(review): the code that followed here (an HTML-based export ending in
# ``return generate_pdf_from_html(html)``) is incomplete in this file: its
# ``def`` line is absent and the HTML markup inside its string literals
# appears to have been stripped (``css_class`` is computed but never used,
# ``html += ""`` appends nothing). It cannot be restored without guessing, so
# the surviving text is preserved verbatim below. Recover the original from
# version control before re-enabling it.
#
# html = """
#
# Conseiller augmenté CEGARA
#
# """ # Trier les chapitres par "num" chapters = sorted(chapter_data, key=lambda x: x["num"]) for chapter in chapters: html += f"
#
# Chapitre {chapter['num']}: {chapter['title']}
#
# " if len(chapter["messages"]) > 1 : # Affichage des messages for msg in chapter["messages"]: if isinstance(msg, HumanMessage): sender = "Utilisateur" css_class = "human" elif isinstance(msg, AIMessage): sender = "IA" css_class = "ai" elif isinstance(msg, SystemMessage): sender = "Système" css_class = "system" else: sender = "Message" css_class = "" html += f"""
# {sender} : {markdown_to_html(msg.content)}
# """ html += "" return generate_pdf_from_html(html)


def generate_pdf(chapter_data: list, displayPromptSystem=True):
    """Render the chapters and their messages into an A4 PDF.

    Args:
        chapter_data: Chapters, each a mapping with the keys ``"num"`` (sort
            key), ``"title"`` and ``"messages"`` (message objects exposing
            ``.content``).
        displayPromptSystem: When false, ``SystemMessage`` entries are left
            out of the document.

    Returns:
        A ``BytesIO`` positioned at offset 0 that contains the PDF.
    """
    buffer = BytesIO()
    doc = SimpleDocTemplate(buffer, pagesize=A4)
    styles = getSampleStyleSheet()

    style_title = styles["Title"]
    style_title.alignment = TA_CENTER  # centre the document title
    style_header = styles["Heading2"]
    style_message = styles["BodyText"]

    # Main document title, followed by some vertical space.
    elements = [
        Paragraph("Conseiller augmenté CEGARA", style_title),
        Spacer(1, 20),
    ]

    # Chapters are rendered in ascending "num" order.
    for chapter in sorted(chapter_data, key=lambda x: x["num"]):
        elements.append(
            Paragraph(f"Chapitre {chapter['num']}: {chapter['title']}", style_header)
        )
        elements.append(Spacer(1, 10))

        # A chapter holding a single message has no exchange to print.
        if len(chapter["messages"]) > 1:
            for msg in chapter["messages"]:
                if isinstance(msg, HumanMessage):
                    color = "blue"
                    sender = "Utilisateur"
                elif isinstance(msg, AIMessage):
                    color = "green"
                    sender = "Conseiller augmenté CEGARA"
                elif isinstance(msg, SystemMessage):
                    if not displayPromptSystem:
                        # Previously fell through to the generic "Message"
                        # branch and was printed anyway.
                        continue
                    color = "red"
                    sender = "Système"
                else:
                    color = "black"
                    sender = "Message"

                # The per-sender colour was computed but never applied.
                elements.append(
                    Paragraph(f'<font color="{color}">{sender}</font>', style_message)
                )
                elements.append(Paragraph(markdown_to_html(msg.content), style_message))
                elements.append(Spacer(1, 10))

        elements.append(Spacer(1, 15))  # spacing between chapters

    doc.build(elements)
    buffer.seek(0)
    return buffer