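# NOTE: the lines below are residue from the web file viewer this source was
# copied from; they are kept as a comment so that the module parses.
#   Ilyas KHIAT | first app | b31069e | raw | history blame | 3.6 kB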
import streamlit as st
import pymupdf as fitz
import pyperclip
from utils.audit.audit_doc import audit_descriptif
def extract_text_from_pdf(file):
    """Return the plain text of every page of a PDF, concatenated in order.

    Args:
        file: A binary file-like object whose ``read()`` returns the raw
            bytes of a PDF document.

    Returns:
        The text of all pages joined together without any separator.
    """
    # The document is opened from memory. Using it as a context manager
    # guarantees it is closed even if extracting a page raises, so the
    # underlying handle is not leaked.
    with fitz.open(stream=file.read(), filetype="pdf") as document:
        return "".join(page.get_text("text") for page in document)
def classify_file(file):
    """Map an uploaded file's MIME type to a coarse category name.

    Args:
        file: Any object exposing a ``type`` attribute holding a MIME type
            string.

    Returns:
        One of ``"image"``, ``"pdf"``, ``"word"``, ``"audio"``, ``"text"``
        or ``"unknown"``.
    """
    mime = file.type

    # Whole MIME families are recognised by their prefix.
    family_prefixes = (
        ("image/", "image"),
        ("audio/", "audio"),
        ("text/", "text"),
    )
    for prefix, category in family_prefixes:
        if mime.startswith(prefix):
            return category

    # Individual document formats need an exact match.
    exact_types = {
        "application/pdf": "pdf",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "word",
    }
    return exact_types.get(mime, "unknown")
def _format_audit(stats):
    """Render label/value pairs as the bulleted text block passed to ``st.code``."""
    lines = ["Audit descriptif"]
    lines.extend(f"- {label}: {value}" for label, value in stats.items())
    return "\n".join(lines) + "\n"


def main():
    """Render the document-audit page.

    Shows a file uploader; when a PDF is uploaded, runs ``audit_descriptif``
    on it, then displays the whole-document statistics and, in an expander,
    the statistics of a page chosen by the user.

    The audit result is read as a mapping with the keys ``number_of_pages``,
    ``number_of_images``, ``number_of_links``, ``number_of_tables``,
    ``number_of_tokens`` and ``number_of_words``, plus one ``page_<i>`` entry
    per zero-based page index holding the same per-page counters (except the
    page count).
    """
    st.title("AUDIT DES DOCUMENTS")

    # The audit and the name of the audited file are kept in session state so
    # that a rerun (e.g. changing the page number) reuses the stored result.
    if "audit" not in st.session_state:
        st.session_state.audit = {}
    if "name_file" not in st.session_state:
        st.session_state.name_file = ""

    uploaded_file = st.file_uploader("Télécharger un documents")
    if uploaded_file is None:
        return

    file_kind = classify_file(uploaded_file)
    if file_kind != "pdf":
        # Only PDFs are audited here; tell the user instead of doing nothing.
        st.warning("Seuls les fichiers PDF sont pris en charge pour le moment.")
        return

    if st.session_state.name_file != uploaded_file.name:
        with st.spinner("Analyse du document..."):
            st.session_state.audit = audit_descriptif(uploaded_file)
        # Record the file name only once the audit has succeeded. Otherwise a
        # failed analysis would be treated as done on the next rerun and a
        # stale or empty audit would be displayed for this file.
        st.session_state.name_file = uploaded_file.name
    audit = st.session_state.audit

    document_stats = {
        "Nombre de pages": audit["number_of_pages"],
        "Nombre d'images": audit["number_of_images"],
        "Nombre de liens": audit["number_of_links"],
        "Nombre de tableaux": audit["number_of_tables"],
        "Nombre de tokens": audit["number_of_tokens"],
        "Nombre de mots": audit["number_of_words"],
    }
    st.write("### Audit de tout le document")
    st.code(_format_audit(document_stats))

    with st.expander("Audit par page"):
        page_number = st.number_input(
            "Numéro de page",
            min_value=1,
            max_value=audit["number_of_pages"],
            value=1,
        )
        # Pages are numbered from 1 for the user but stored under
        # zero-based ``page_<i>`` keys.
        page_audit = audit[f"page_{page_number - 1}"]
        page_stats = {
            "Nombre d'images": page_audit["number_of_images"],
            "Nombre de liens": page_audit["number_of_links"],
            "Nombre de tableaux": page_audit["number_of_tables"],
            "Nombre de tokens": page_audit["number_of_tokens"],
            "Nombre de mots": page_audit["number_of_words"],
        }
        st.code(_format_audit(page_stats))
# Call main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()