Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import os
|
2 |
-
import
|
3 |
from pathlib import Path
|
4 |
from typing import List
|
5 |
from PyPDF2 import PdfReader
|
@@ -24,12 +24,6 @@ from docling.datamodel.pipeline_options import (
|
|
24 |
OUTPUT_DIR = Path("output")
|
25 |
OUTPUT_DIR.mkdir(exist_ok=True)
|
26 |
|
27 |
-
FIGURES_DIR = OUTPUT_DIR / "figures"
|
28 |
-
FIGURES_DIR.mkdir(exist_ok=True)
|
29 |
-
|
30 |
-
TABLES_DIR = OUTPUT_DIR / "tables"
|
31 |
-
TABLES_DIR.mkdir(exist_ok=True)
|
32 |
-
|
33 |
# Vérification de validité des fichiers PDF
|
34 |
def is_valid_pdf(file_path):
|
35 |
try:
|
@@ -95,30 +89,35 @@ def create_document_converter(
|
|
95 |
)
|
96 |
|
97 |
# Interface Streamlit
|
98 |
-
st.title("Conversion de documents PDF avec OCR")
|
99 |
-
st.subheader("Téléchargez un PDF pour commencer le traitement")
|
100 |
|
101 |
-
uploaded_files = st.file_uploader(
|
|
|
|
|
102 |
use_ocr = st.checkbox("👁️🗨️ Activer l'OCR", value=True)
|
103 |
export_figures = st.checkbox("🖼️ Exporter les images", value=True)
|
104 |
export_tables = st.checkbox("📋 Exporter les tableaux", value=True)
|
105 |
ocr_engine = st.selectbox("Moteur OCR", ["easyocr", "tesseract_cli", "tesserocr", "rapidocr", "ocrmac"])
|
106 |
ocr_languages = st.text_input("Langues OCR (ex : eng, fra)", "eng").split(",")
|
107 |
-
table_mode = st.selectbox("Mode des tableaux", ["
|
108 |
|
109 |
if st.button("Convertir"):
|
110 |
if uploaded_files:
|
111 |
input_paths = []
|
112 |
generated_files = []
|
|
|
|
|
113 |
|
|
|
114 |
for uploaded_file in uploaded_files:
|
115 |
file_path = OUTPUT_DIR / uploaded_file.name
|
116 |
with open(file_path, "wb") as f:
|
117 |
f.write(uploaded_file.read())
|
118 |
-
st.write(f"Fichier reçu : {file_path} (
|
119 |
|
120 |
if not is_valid_pdf(file_path):
|
121 |
-
st.error(f"Le fichier {file_path.name} n'est pas un PDF valide.")
|
122 |
continue
|
123 |
input_paths.append(file_path)
|
124 |
|
@@ -133,25 +132,50 @@ if st.button("Convertir"):
|
|
133 |
ocr_languages=ocr_languages,
|
134 |
)
|
135 |
|
|
|
|
|
|
|
|
|
136 |
# Conversion des fichiers
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
else:
|
157 |
-
st.error("Veuillez télécharger au moins un fichier PDF.")
|
|
|
1 |
import os
|
2 |
+
import time
|
3 |
from pathlib import Path
|
4 |
from typing import List
|
5 |
from PyPDF2 import PdfReader
|
|
|
24 |
OUTPUT_DIR = Path("output")
|
25 |
OUTPUT_DIR.mkdir(exist_ok=True)
|
26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
# Vérification de validité des fichiers PDF
|
28 |
def is_valid_pdf(file_path):
|
29 |
try:
|
|
|
89 |
)
|
90 |
|
91 |
# Streamlit interface: page header, PDF uploader and conversion options.
st.title("📄 Conversion de documents PDF avec OCR")
st.subheader("🖼️ Téléchargez un PDF pour commencer le traitement")

uploaded_files = st.file_uploader(
    "Sélectionnez vos fichiers PDF", accept_multiple_files=True, type=["pdf"]
)
use_ocr = st.checkbox("👁️🗨️ Activer l'OCR", value=True)
export_figures = st.checkbox("🖼️ Exporter les images", value=True)
export_tables = st.checkbox("📋 Exporter les tableaux", value=True)
ocr_engine = st.selectbox("Moteur OCR", ["easyocr", "tesseract_cli", "tesserocr", "rapidocr", "ocrmac"])
# The hint shown to the user is "eng, fra"; a bare split(",") would turn that
# into ["eng", " fra"]. Strip each entry and drop empty ones (e.g. from a
# trailing comma) so only clean language codes are passed on.
ocr_languages = [
    code.strip()
    for code in st.text_input("Langues OCR (ex : eng, fra)", "eng").split(",")
    if code.strip()
]
table_mode = st.selectbox("Mode des tableaux", ["ACCURATE", "FAST"])
|
104 |
|
105 |
if st.button("Convertir"):
|
106 |
if uploaded_files:
|
107 |
input_paths = []
|
108 |
generated_files = []
|
109 |
+
total_files = len(uploaded_files)
|
110 |
+
start_time = time.time() # Chronomètre de démarrage
|
111 |
|
112 |
+
# Save every uploaded file under OUTPUT_DIR and queue the ones that
# is_valid_pdf accepts for conversion.
for uploaded_file in uploaded_files:
    # The name is supplied with the upload; keep only its final path
    # component so a crafted name such as "../../x.pdf" cannot make the
    # write land outside OUTPUT_DIR.
    safe_name = Path(uploaded_file.name).name
    file_path = OUTPUT_DIR / safe_name
    with open(file_path, "wb") as f:
        f.write(uploaded_file.read())
    st.write(f"📥 Fichier reçu : `{file_path.name}` ({os.path.getsize(file_path)} octets)")

    # Report rejected files to the user and leave them out of the queue.
    if not is_valid_pdf(file_path):
        st.error(f"❌ Le fichier {file_path.name} n'est pas un PDF valide.")
        continue
    input_paths.append(file_path)
|
123 |
|
|
|
132 |
ocr_languages=ocr_languages,
|
133 |
)
|
134 |
|
135 |
+
# Progress bar and a single status line that is overwritten as work advances.
progress_bar = st.progress(0)
status_placeholder = st.empty()

# Convert the queued files one at a time.
# Progress, the "(i/N)" label and the remaining-time estimate are based on the
# number of queued (valid) files. Uploads rejected during validation are never
# appended to input_paths, so counting every upload would stop the bar short
# of 100 % and overstate the time remaining.
files_to_convert = len(input_paths)
for i, file_path in enumerate(input_paths):
    status_placeholder.info(
        f"🔄 Traitement de `{file_path.name}` ({i + 1}/{files_to_convert})"
    )
    file_start_time = time.time()

    # A single-item list is passed so each file gets its own status update
    # and timing; errors are reported through conv_res.status.
    for conv_res in converter.convert_all([file_path], raises_on_error=False):
        if conv_res.status == ConversionStatus.SUCCESS:
            st.success(f"✅ Conversion réussie : `{conv_res.input.file}`")
            output_file = OUTPUT_DIR / f"{conv_res.input.file.stem}.md"
            # NOTE(review): only a heading is written here, not the converted
            # document body — this looks like a placeholder; confirm.
            # Explicit UTF-8 keeps the write independent of the host locale
            # (input paths may contain non-ASCII characters).
            output_file.write_text(
                f"## Contenu converti pour `{conv_res.input.file}`",
                encoding="utf-8",
            )
            generated_files.append(output_file)
        else:
            st.error(f"❌ Échec de la conversion pour : `{conv_res.input.file}`")

    # Rough ETA: assume every remaining file takes as long as the one that
    # just finished.
    elapsed_time = time.time() - file_start_time
    remaining_files = files_to_convert - (i + 1)
    estimated_time_remaining = elapsed_time * remaining_files
    status_placeholder.info(
        f"⏳ Temps restant estimé : {int(estimated_time_remaining)} secondes"
    )

    # Advance the progress bar (fraction in the range 0.0–1.0).
    progress_bar.progress((i + 1) / files_to_convert)

# List the generated files with a preview of each one.
st.subheader("📂 Fichiers générés")
for generated_file in generated_files:
    st.markdown(f"📄 **{generated_file.name}**")
    # Read back with the same encoding that was used for writing.
    content = generated_file.read_text(encoding="utf-8")
    st.text_area(f"Prévisualisation : {generated_file.name}", value=content, height=200)

# Total wall-clock time since the button handler started.
total_time = time.time() - start_time
st.success(f"✅ Conversion terminée en {int(total_time)} secondes !")
|
180 |
else:
|
181 |
+
st.error("❌ Veuillez télécharger au moins un fichier PDF.")
|