Spaces:
Running
Running
import streamlit as st | |
import pymupdf as fitz | |
import pyperclip | |
from utils.audit.audit_doc import audit_descriptif | |
# Function to extract text from PDF | |
def extract_text_from_pdf(file):
    """Return the plain text of every page of a PDF, in page order.

    Args:
        file: Binary file-like object whose ``read()`` returns the raw PDF
            bytes.

    Returns:
        str: The text of all pages concatenated, with no separator added
        between pages.
    """
    # Opening the document as a context manager guarantees it is closed,
    # even if text extraction fails on one of the pages.
    with fitz.open(stream=file.read(), filetype="pdf") as document:
        return "".join(page.get_text("text") for page in document)
# Function to classify file type | |
def classify_file(file):
    """Map an uploaded file's MIME type to a coarse category.

    Args:
        file: Object exposing a ``type`` attribute that holds a MIME type
            string such as ``"application/pdf"``.

    Returns:
        str: One of ``"image"``, ``"pdf"``, ``"word"``, ``"audio"``,
        ``"text"`` or ``"unknown"``.
    """
    # A missing or None type is reported as "unknown" instead of raising.
    raw_type = getattr(file, "type", None) or ""
    # MIME types are case-insensitive and may carry parameters
    # (e.g. "text/plain; charset=utf-8"); compare the bare, lowercased type.
    mime = raw_type.split(";", 1)[0].strip().lower()

    exact_matches = {
        "application/pdf": "pdf",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "word",
    }
    if mime in exact_matches:
        return exact_matches[mime]

    prefix_matches = (
        ("image/", "image"),
        ("audio/", "audio"),
        ("text/", "text"),
    )
    for prefix, category in prefix_matches:
        if mime.startswith(prefix):
            return category
    return "unknown"
def _format_audit(stats):
    """Render audit statistics as the text block displayed with ``st.code``.

    Args:
        stats: Mapping of display label to value.

    Returns:
        str: An ``"Audit descriptif"`` header line followed by one
        ``"- label: value"`` line per entry, each newline-terminated.
    """
    return "".join(
        ["Audit descriptif\n"]
        + [f"- {label}: {value}\n" for label, value in stats.items()]
    )


def main():
    """Run the Streamlit page that audits an uploaded PDF document.

    The page shows a file uploader. When a PDF is uploaded, the audit
    produced by ``audit_descriptif`` is stored in ``st.session_state.audit``
    and recomputed only when the uploaded file's name differs from
    ``st.session_state.name_file``. The document-wide counts are displayed
    first, followed by an expander with the counts of a user-selected page.
    Uploads that are not classified as PDF produce a warning.
    """
    st.title("AUDIT DES DOCUMENTS")

    # Make sure the keys used below exist in the session state.
    if "audit" not in st.session_state:
        st.session_state.audit = {}
    if "name_file" not in st.session_state:
        st.session_state.name_file = ""

    uploaded_file = st.file_uploader("Télécharger un documents")
    if uploaded_file is None:
        return

    file_kind = classify_file(uploaded_file)
    if file_kind != "pdf":
        # Tell the user why nothing is displayed instead of silently ignoring
        # the upload.
        st.warning("Seuls les documents PDF peuvent être audités.")
        return

    if st.session_state.name_file != uploaded_file.name:
        with st.spinner("Analyse du document..."):
            st.session_state.audit = audit_descriptif(uploaded_file)
        # Record the file name only after the audit succeeded; otherwise a
        # failed audit would leave a stale/empty audit bound to the new name.
        st.session_state.name_file = uploaded_file.name
    audit = st.session_state.audit

    # Display label -> key in the audit dictionaries.
    page_labels = {
        "Nombre d'images": "number_of_images",
        "Nombre de liens": "number_of_links",
        "Nombre de tableaux": "number_of_tables",
        "Nombre de tokens": "number_of_tokens",
        "Nombre de mots": "number_of_words",
    }
    document_labels = {"Nombre de pages": "number_of_pages", **page_labels}

    st.write("### Audit de tout le document")
    st.code(
        _format_audit(
            {label: audit[key] for label, key in document_labels.items()}
        )
    )

    # Per-page audit
    with st.expander("Audit par page"):
        number = st.number_input(
            "Numéro de page",
            min_value=1,
            max_value=audit["number_of_pages"],
            value=1,
        )
        # Pages are numbered from 1 in the UI but stored under zero-based
        # "page_<index>" keys in the audit.
        audit_page = audit[f"page_{number-1}"]
        st.code(
            _format_audit(
                {label: audit_page[key] for label, key in page_labels.items()}
            )
        )
# Launch the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()