import pymupdf
import tiktoken
import markdown
import re
from io import BytesIO
from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.enums import TA_CENTER
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage


def count_tokens(input_string: str) -> int:
    """Return the number of ``cl100k_base`` tokens in *input_string*.

    Special-token literals that happen to appear in the text (for example
    ``<|endoftext|>``) are encoded as ordinary text instead of raising
    ``ValueError``, because the input is arbitrary document content.
    """
    tokenizer = tiktoken.get_encoding("cl100k_base")
    return len(tokenizer.encode(input_string, disallowed_special=()))


def _extract_page_images(document, page, max_img_width) -> list:
    """Return ``(image_bytes, width, height)`` for every image on *page*.

    The reported width/height are scaled down proportionally when the
    image is wider than *max_img_width*; the image bytes are left untouched.
    """
    extracted = []
    for img in page.get_images():
        xref = img[0]
        base_image = document.extract_image(xref)
        image_width = base_image["width"]
        image_height = base_image["height"]

        # Adjust the reported size if it exceeds the maximum width.
        if image_width > max_img_width:
            ratio = max_img_width / image_width
            image_width = max_img_width
            image_height = int(image_height * ratio)

        extracted.append((base_image["image"], image_width, image_height))
    return extracted


def _extract_page_links(page) -> list:
    """Return ``{"uri", "page"}`` dicts for every URI link on *page*."""
    return [
        {"uri": link["uri"], "page": page.number}
        for link in page.get_links()
        if link["kind"] == pymupdf.LINK_URI and "uri" in link
    ]


def audit_descriptif_pdf(file, max_img_width=500) -> dict:
    """Audit a PDF and extract its content page by page.

    Args:
        file: Binary file-like object; its ``read()`` result is handed to
            ``pymupdf.open`` as an in-memory stream.
        max_img_width: Maximum reported image width. Wider images have
            their reported dimensions scaled down proportionally.

    Returns:
        A dict with two entries:

        * ``"audit"``: document-wide counters (pages, images, links, tables,
          tokens, words, an empty ``"key_words"`` list) plus one
          ``"page_<n>"`` dict of per-page counters, where ``<n>`` is
          ``page.number``.
        * ``"content"``: one ``"page_<n>"`` dict per page holding
          ``"images"``, ``"texte"``, ``"liens"`` and ``"tableaux"``.
    """
    # The context manager closes the document once everything has been
    # copied out of it (bytes, strings, DataFrames).
    with pymupdf.open(stream=file.read()) as document:
        audit_dict_doc = {
            "number_of_pages": len(document),
            "number_of_images": 0,
            "number_of_links": 0,
            "number_of_tables": 0,
            "number_of_tokens": 0,
            "number_of_words": 0,
            "key_words": [],
        }
        doc_content = {}

        for page in document:
            images = _extract_page_images(document, page, max_img_width)
            links = _extract_page_links(page)
            tables = [tab.to_pandas() for tab in page.find_tables().tables]

            text = page.get_text("text")
            number_tokens = count_tokens(text)
            number_words = len(text.split())

            audit_dict_page = {
                "number_of_images": len(images),
                "number_of_links": len(links),
                "number_of_tables": len(tables),
                "number_of_tokens": number_tokens,
                "number_of_words": number_words,
            }
            page_content = {
                "images": images,
                "texte": text,
                "liens": links,
                "tableaux": tables,
            }

            # Roll the page counters up into the document totals.
            audit_dict_doc["number_of_images"] += len(images)
            audit_dict_doc["number_of_links"] += len(links)
            audit_dict_doc["number_of_tables"] += len(tables)
            audit_dict_doc["number_of_tokens"] += number_tokens
            audit_dict_doc["number_of_words"] += number_words

            audit_dict_doc[f"page_{page.number}"] = audit_dict_page
            doc_content[f"page_{page.number}"] = page_content

    return {
        "audit": audit_dict_doc,
        "content": doc_content,
    }


def markdown_to_html(md_text):
    """Convert Markdown text to HTML for use in the PDF."""
    return markdown.markdown(md_text, output_format='html')