Spaces:

bziiit
/

AGENT_ANALYSE_RAG

Running

Ilyas KHIAT

first app

b31069e 12 months ago

3.6 kB

	import streamlit as st
	import pymupdf as fitz
	import pyperclip
	from utils.audit.audit_doc import audit_descriptif

	# Function to extract text from PDF
	def extract_text_from_pdf(file):
	    """Return the text of every page of a PDF, concatenated in page order.

	    Args:
	        file: Binary file-like object. Its content is consumed with
	            ``file.read()`` and parsed as a PDF from memory.

	    Returns:
	        str: The ``"text"`` extraction of each page joined with no
	        separator; an empty string when the document has no pages.
	    """
	    # The context manager closes the document even if a page fails to
	    # extract, so the parsed PDF is not kept alive after the call.
	    with fitz.open(stream=file.read(), filetype="pdf") as document:
	        return "".join(page.get_text("text") for page in document)

	# Function to classify file type
	def classify_file(file):
	    """Map an uploaded file's MIME type to a coarse category name.

	    Args:
	        file: Object exposing the MIME type as a string in ``file.type``.

	    Returns:
	        str: One of ``"image"``, ``"pdf"``, ``"word"``, ``"audio"``,
	        ``"text"`` or ``"unknown"``.
	    """
	    mime = file.type

	    # Whole families of MIME types are recognised by their prefix.
	    if mime.startswith("image/"):
	        return "image"

	    # Document formats are recognised by their exact MIME type.
	    exact_matches = {
	        "application/pdf": "pdf",
	        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "word",
	    }
	    if mime in exact_matches:
	        return exact_matches[mime]

	    for family in ("audio", "text"):
	        if mime.startswith(family + "/"):
	            return family

	    return "unknown"

	def _format_audit(stats, include_page_count=False):
	    """Render audit counters as the bullet list shown in the UI.

	    Args:
	        stats: Mapping holding the ``number_of_*`` counters read below.
	        include_page_count: When true, a "Nombre de pages" line (read from
	            ``stats["number_of_pages"]``) is listed first.

	    Returns:
	        str: A header line followed by one ``- label: value`` line per
	        counter, each terminated by a newline.
	    """
	    fields = [
	        ("Nombre d'images", "number_of_images"),
	        ("Nombre de liens", "number_of_links"),
	        ("Nombre de tableaux", "number_of_tables"),
	        ("Nombre de tokens", "number_of_tokens"),
	        ("Nombre de mots", "number_of_words"),
	    ]
	    if include_page_count:
	        fields.insert(0, ("Nombre de pages", "number_of_pages"))

	    bullet_lines = "".join(f"- {label}: {stats[key]}\n" for label, key in fields)
	    return "Audit descriptif\n" + bullet_lines


	def main():
	    """Render the document-audit page.

	    Shows a file uploader; when the uploaded file is classified as a PDF,
	    runs ``audit_descriptif`` on it (once per file name, cached in
	    ``st.session_state``) and displays the document-level counters plus a
	    per-page view inside an expander. Other file types render nothing.
	    """
	    st.title("AUDIT DES DOCUMENTS")

	    # Session defaults: the last audit result and the file name it belongs to.
	    if "audit" not in st.session_state:
	        st.session_state.audit = {}
	    if "name_file" not in st.session_state:
	        st.session_state.name_file = ""

	    uploaded_file = st.file_uploader("Télécharger un documents")
	    if uploaded_file is None:
	        return

	    # Only PDFs are audited.
	    if classify_file(uploaded_file) != "pdf":
	        return

	    # Re-run the analysis only when a file with a different name is uploaded.
	    if st.session_state.name_file != uploaded_file.name:
	        with st.spinner("Analyse du document..."):
	            st.session_state.audit = audit_descriptif(uploaded_file)
	        # Record the name only after the audit succeeded; otherwise a failed
	        # analysis would be treated as cached and the (empty) audit would
	        # raise KeyError on every later rerun instead of being retried.
	        st.session_state.name_file = uploaded_file.name
	    audit = st.session_state.audit

	    st.write("### Audit de tout le document")
	    st.code(_format_audit(audit, include_page_count=True))

	    # Per-page audit
	    with st.expander("Audit par page"):
	        number = st.number_input(
	            "Numéro de page",
	            min_value=1,
	            max_value=audit["number_of_pages"],
	            value=1,
	        )
	        # The widget is 1-based while per-page entries are keyed "page_0", "page_1", ...
	        audit_page = audit[f"page_{number-1}"]
	        st.code(_format_audit(audit_page))

	# # Button to copy text to clipboard
	# if st.button("Copy to Clipboard"):
	# pyperclip.copy(audit)
	# st.success("Text copied to clipboard successfully!")
	# else:
	# st.info("Please upload a PDF file to extract text.")

	# Script entry point: render the page only when this file is executed directly.
	if __name__ == "__main__":
	    main()