Spaces:
Running
Running
Artemis-IA
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -1,12 +1,12 @@
|
|
1 |
import os
|
2 |
import time
|
3 |
-
import datetime
|
4 |
from pathlib import Path
|
5 |
-
from typing import List
|
6 |
from PIL import Image
|
7 |
-
from PyPDF2 import PdfReader
|
8 |
import streamlit as st
|
9 |
import pandas as pd
|
|
|
|
|
10 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
11 |
from docling.datamodel.base_models import InputFormat
|
12 |
from docling.datamodel.document import ConversionStatus
|
@@ -24,70 +24,47 @@ from docling.datamodel.pipeline_options import (
|
|
24 |
)
|
25 |
from docling_core.types.doc import PictureItem, TableItem
|
26 |
|
27 |
-
#
|
28 |
OUTPUT_DIR = Path("output")
|
29 |
-
OUTPUT_DIR.mkdir(exist_ok=True)
|
30 |
-
|
31 |
FIGURES_DIR = OUTPUT_DIR / "figures"
|
32 |
-
FIGURES_DIR.mkdir(exist_ok=True)
|
33 |
-
|
34 |
TABLES_DIR = OUTPUT_DIR / "tables"
|
35 |
-
TABLES_DIR.mkdir(exist_ok=True)
|
36 |
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
return False
|
48 |
-
|
49 |
-
# Fonction pour configurer le convertisseur de documents
|
50 |
-
def create_document_converter(
|
51 |
-
use_ocr: bool,
|
52 |
-
export_figures: bool,
|
53 |
-
export_tables: bool,
|
54 |
-
accelerator: str,
|
55 |
-
ocr_engine: str,
|
56 |
-
table_mode: str,
|
57 |
-
ocr_languages: List[str],
|
58 |
-
) -> DocumentConverter:
|
59 |
accelerator_options = AcceleratorOptions(
|
60 |
num_threads=8,
|
61 |
-
device=AcceleratorDevice[accelerator.upper()]
|
62 |
)
|
63 |
|
64 |
table_structure_options = TableStructureOptions(
|
65 |
-
mode=TableFormerMode[table_mode.upper()],
|
66 |
-
do_cell_matching=True
|
67 |
)
|
68 |
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
elif ocr_engine == "rapidocr":
|
77 |
-
ocr_options = RapidOcrOptions(lang=ocr_languages)
|
78 |
-
elif ocr_engine == "ocrmac":
|
79 |
-
ocr_options = OcrMacOptions(lang=ocr_languages)
|
80 |
-
else:
|
81 |
-
raise ValueError(f"Moteur OCR non pris en charge : {ocr_engine}")
|
82 |
|
83 |
pipeline_options = PdfPipelineOptions(
|
84 |
-
do_ocr=use_ocr,
|
85 |
generate_page_images=True,
|
86 |
-
generate_picture_images=export_figures,
|
87 |
-
generate_table_images=export_tables,
|
88 |
accelerator_options=accelerator_options,
|
89 |
table_structure_options=table_structure_options,
|
90 |
-
ocr_options=
|
91 |
)
|
92 |
|
93 |
return DocumentConverter(
|
@@ -96,130 +73,215 @@ def create_document_converter(
|
|
96 |
InputFormat.DOCX,
|
97 |
InputFormat.PPTX,
|
98 |
InputFormat.HTML,
|
99 |
-
InputFormat.IMAGE
|
100 |
],
|
101 |
-
format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
|
102 |
)
|
103 |
|
104 |
-
|
105 |
-
|
106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
|
108 |
-
|
109 |
-
|
110 |
-
)
|
111 |
-
use_ocr = st.checkbox("👁️🗨️ Activer l'OCR", value=True)
|
112 |
-
export_figures = st.checkbox("🖼️ Exporter les images", value=True)
|
113 |
-
export_tables = st.checkbox("📋 Exporter les tableaux", value=True)
|
114 |
-
ocr_engine = st.selectbox("Moteur OCR", ["easyocr", "tesseract_cli", "tesserocr", "rapidocr", "ocrmac"])
|
115 |
-
ocr_languages = st.text_input("Langues OCR (ex : en, fr)", "en").split(",")
|
116 |
-
table_mode = st.selectbox("Mode des tableaux", ["ACCURATE", "FAST"])
|
117 |
-
export_formats = st.multiselect(
|
118 |
-
"Formats d'exportation", ["json", "yaml", "md", "multimodal"], default=["md"]
|
119 |
-
)
|
120 |
|
121 |
-
|
122 |
-
|
123 |
-
input_paths = []
|
124 |
-
generated_files = []
|
125 |
-
figures = []
|
126 |
-
tables = []
|
127 |
-
total_files = len(uploaded_files)
|
128 |
-
start_time = time.time() # Chronomètre de démarrage
|
129 |
-
|
130 |
-
# Charger les fichiers téléchargés
|
131 |
-
for uploaded_file in uploaded_files:
|
132 |
file_path = OUTPUT_DIR / uploaded_file.name
|
133 |
-
|
134 |
-
|
135 |
-
st.write(f"📥 Fichier reçu : `{file_path.name}` ({os.path.getsize(file_path)} octets)")
|
136 |
-
|
137 |
if not is_valid_file(file_path):
|
138 |
continue
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
converter = create_document_converter(
|
143 |
-
use_ocr,
|
144 |
-
export_figures,
|
145 |
-
export_tables,
|
146 |
-
accelerator="cpu",
|
147 |
-
ocr_engine=ocr_engine,
|
148 |
-
table_mode=table_mode,
|
149 |
-
ocr_languages=ocr_languages,
|
150 |
-
)
|
151 |
-
|
152 |
-
# Barre de progression
|
153 |
-
progress_bar = st.progress(0)
|
154 |
-
status_placeholder = st.empty()
|
155 |
-
|
156 |
-
# Conversion des fichiers
|
157 |
-
for i, file_path in enumerate(input_paths):
|
158 |
-
status_placeholder.info(
|
159 |
-
f"🔄 Traitement de `{file_path.name}` ({i + 1}/{total_files})"
|
160 |
-
)
|
161 |
-
|
162 |
-
# Conversion du fichier
|
163 |
conv_results = list(converter.convert_all([file_path], raises_on_error=False))
|
|
|
164 |
for conv_res in conv_results:
|
165 |
if conv_res.status == ConversionStatus.SUCCESS:
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
193 |
else:
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
st.
|
201 |
-
for
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
import time
|
|
|
3 |
from pathlib import Path
|
4 |
+
from typing import List, Dict
|
5 |
from PIL import Image
|
|
|
6 |
import streamlit as st
|
7 |
import pandas as pd
|
8 |
+
import json
|
9 |
+
import yaml
|
10 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
11 |
from docling.datamodel.base_models import InputFormat
|
12 |
from docling.datamodel.document import ConversionStatus
|
|
|
24 |
)
|
25 |
from docling_core.types.doc import PictureItem, TableItem
|
26 |
|
27 |
+
# Directory layout: uploads and document exports go to OUTPUT_DIR, while
# extracted figures and tables are kept in dedicated sub-folders.
OUTPUT_DIR = Path("output")
FIGURES_DIR = OUTPUT_DIR.joinpath("figures")
TABLES_DIR = OUTPUT_DIR.joinpath("tables")
|
|
|
31 |
|
32 |
+
def setup_directories():
    """Create the output, figures and tables directories when missing."""
    # The parent directory is listed first so both sub-folders can be
    # created inside it; already-existing directories are left untouched.
    for directory in (OUTPUT_DIR, FIGURES_DIR, TABLES_DIR):
        directory.mkdir(exist_ok=True)
|
36 |
+
|
37 |
+
def is_valid_file(file_path: Path) -> bool:
    """Return True when the file's extension is one the app converts.

    Only the final suffix is inspected and the comparison ignores case, so
    ``REPORT.PDF`` is accepted while ``archive.tar.gz`` or a name without
    any suffix is rejected.

    Args:
        file_path: Path (it does not need to exist) whose suffix is checked.

    Returns:
        True for a supported extension, False otherwise.
    """
    # ".jpeg" and ".htm" are the alternate spellings of ".jpg" and ".html".
    # NOTE(review): st.file_uploader is understood to let these sibling
    # spellings through when "jpg"/"html" are allowed - confirm against the
    # installed Streamlit version. Rejecting them here made the caller skip
    # such uploads without any message.
    valid_extensions = frozenset(
        {".pdf", ".docx", ".pptx", ".html", ".htm", ".png", ".jpg", ".jpeg"}
    )
    return file_path.suffix.lower() in valid_extensions
|
40 |
+
|
41 |
+
def create_document_converter(config: Dict) -> DocumentConverter:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
accelerator_options = AcceleratorOptions(
|
43 |
num_threads=8,
|
44 |
+
device=AcceleratorDevice[config['accelerator'].upper()]
|
45 |
)
|
46 |
|
47 |
table_structure_options = TableStructureOptions(
|
48 |
+
mode=TableFormerMode[config['table_mode'].upper()],
|
49 |
+
do_cell_matching=True
|
50 |
)
|
51 |
|
52 |
+
ocr_engines = {
|
53 |
+
"easyocr": EasyOcrOptions(lang=config['ocr_languages']),
|
54 |
+
"tesseract_cli": TesseractCliOcrOptions(lang=config['ocr_languages']),
|
55 |
+
"tesserocr": TesseractOcrOptions(lang=config['ocr_languages']),
|
56 |
+
"rapidocr": RapidOcrOptions(lang=config['ocr_languages']),
|
57 |
+
"ocrmac": OcrMacOptions(lang=config['ocr_languages'])
|
58 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
|
60 |
pipeline_options = PdfPipelineOptions(
|
61 |
+
do_ocr=config['use_ocr'],
|
62 |
generate_page_images=True,
|
63 |
+
generate_picture_images=config['export_figures'],
|
64 |
+
generate_table_images=config['export_tables'],
|
65 |
accelerator_options=accelerator_options,
|
66 |
table_structure_options=table_structure_options,
|
67 |
+
ocr_options=ocr_engines[config['ocr_engine']]
|
68 |
)
|
69 |
|
70 |
return DocumentConverter(
|
|
|
73 |
InputFormat.DOCX,
|
74 |
InputFormat.PPTX,
|
75 |
InputFormat.HTML,
|
76 |
+
InputFormat.IMAGE
|
77 |
],
|
78 |
+
format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
|
79 |
)
|
80 |
|
81 |
+
def process_files(uploaded_files, config: Dict) -> Dict:
    """Convert every uploaded file and collect the generated artefacts.

    Args:
        uploaded_files: Sequence of uploaded file objects; each one must
            expose ``name`` and ``getbuffer()``.
        config: Conversion settings. ``config['export_formats']`` is read
            here; the whole dict is forwarded to
            ``create_document_converter``.

    Returns:
        A dict with the keys ``'figures'``, ``'tables_csv'`` and
        ``'tables_html'`` (lists of paths), ``'exports'`` (one list per
        requested export format) and ``'processing_time'`` (elapsed
        seconds as a float).
    """
    setup_directories()
    converter = create_document_converter(config)
    results = {
        'figures': [],
        'tables_csv': [],
        'tables_html': [],
        'exports': {fmt: [] for fmt in config['export_formats']}
    }

    progress_bar = st.progress(0)
    status_placeholder = st.empty()
    start_time = time.time()
    total_files = len(uploaded_files)

    for idx, uploaded_file in enumerate(uploaded_files):
        try:
            # Keep only the last path component so an upload name cannot
            # point outside the output directory.
            file_path = OUTPUT_DIR / Path(uploaded_file.name).name

            # Validate before writing: unsupported files are reported and
            # never stored next to the conversion results.
            if not is_valid_file(file_path):
                st.warning(f"Fichier ignoré (format non pris en charge) : {file_path.name}")
                continue

            file_path.write_bytes(uploaded_file.getbuffer())
            status_placeholder.info(f"Traitement de {file_path.name} ({idx+1}/{total_files})")

            for conv_res in converter.convert_all([file_path], raises_on_error=False):
                if conv_res.status == ConversionStatus.SUCCESS:
                    handle_successful_conversion(conv_res, results, config['export_formats'])
                else:
                    # raises_on_error=False reports failures through the
                    # status only, so surface them instead of dropping them.
                    st.warning(f"Conversion non aboutie pour {file_path.name} (statut : {conv_res.status})")

        except Exception as e:
            # Top-level boundary for one file: report it and carry on with
            # the remaining uploads.
            st.error(f"Erreur avec {uploaded_file.name}: {str(e)}")
        finally:
            # Runs for skipped and failed files too, so the bar always
            # reaches 100% once the loop is done.
            progress_bar.progress((idx + 1) / total_files)

    results['processing_time'] = time.time() - start_time
    return results
|
118 |
+
|
119 |
+
def handle_successful_conversion(conv_res, results: Dict, export_formats: List[str]):
    """Write the requested exports for one converted document and extract its items.

    Args:
        conv_res: Conversion result; ``conv_res.input.file`` supplies the
            output file stem and ``conv_res.document`` the content.
        results: Accumulator mutated in place. ``results['exports'][fmt]``
            receives ``(path, content)`` tuples for "md", "json" and
            "yaml", and the bare path for "multimodal"; figures and tables
            are appended by the element handlers.
        export_formats: Formats to write; each one is also used as the
            output file extension.
    """
    stem = conv_res.input.file.stem

    # Document-level exports
    for fmt in export_formats:
        output_file = OUTPUT_DIR / f"{stem}.{fmt}"
        # Explicit UTF-8: the JSON and YAML dumps below keep non-ASCII
        # characters unescaped, which a non-UTF-8 platform default encoding
        # may be unable to write.
        with open(output_file, "w", encoding="utf-8") as f:
            if fmt == "md":
                content = conv_res.document.export_to_markdown()
                f.write(content)
                results['exports']['md'].append((output_file, content))
            elif fmt == "json":
                content = json.dumps(conv_res.document.export_to_dict(), ensure_ascii=False, indent=2)
                f.write(content)
                results['exports']['json'].append((output_file, content))
            elif fmt == "yaml":
                content = yaml.dump(conv_res.document.export_to_dict(), allow_unicode=True)
                f.write(content)
                results['exports']['yaml'].append((output_file, content))
            elif fmt == "multimodal":
                # NOTE(review): nothing is written for this format, so the
                # file stays empty; only its path is recorded - confirm
                # that this is intended.
                results['exports']['multimodal'].append(output_file)

    # Element extraction: pictures and tables go to their own handlers.
    for element, _ in conv_res.document.iterate_items():
        if isinstance(element, PictureItem):
            handle_picture_element(element, conv_res, results)
        elif isinstance(element, TableItem):
            handle_table_element(element, conv_res, results)
|
145 |
+
|
146 |
+
def handle_picture_element(element: PictureItem, conv_res, results: Dict):
    """Save one picture item as a PNG in FIGURES_DIR and record its path.

    Args:
        element: Picture item taken from the converted document.
        conv_res: Conversion result; ``conv_res.input.file.stem`` prefixes
            the file name.
        results: Accumulator; the saved path is appended to
            ``results['figures']``, whose current length numbers the file.
    """
    # NOTE(review): the image reference is presumably None when picture
    # image generation is switched off - confirm against docling_core.
    # Skipping here keeps one image-less picture from aborting the whole file.
    image_ref = element.image
    pil_image = image_ref.pil_image if image_ref is not None else None
    if pil_image is None:
        return

    fig_path = FIGURES_DIR / f"{conv_res.input.file.stem}_figure_{len(results['figures'])}.png"
    pil_image.save(fig_path)
    results['figures'].append(fig_path)
|
150 |
+
|
151 |
+
def handle_table_element(element: TableItem, conv_res, results: Dict):
    """Export one table item to CSV and HTML files in TABLES_DIR.

    Args:
        element: Table item taken from the converted document.
        conv_res: Conversion result; ``conv_res.input.file.stem`` prefixes
            both file names.
        results: Accumulator; the CSV path is appended to
            ``results['tables_csv']`` and the HTML path to
            ``results['tables_html']``. The current length of each list
            numbers the corresponding file.
    """
    csv_path = TABLES_DIR / f"{conv_res.input.file.stem}_table_{len(results['tables_csv'])}.csv"
    element.export_to_dataframe().to_csv(csv_path, index=False)
    results['tables_csv'].append(csv_path)

    html_path = TABLES_DIR / f"{conv_res.input.file.stem}_table_{len(results['tables_html'])}.html"
    # The platform default encoding is kept so that code reading the file
    # back with the same default still matches. Characters that encoding
    # cannot represent are written as numeric HTML character references
    # instead of raising UnicodeEncodeError.
    with open(html_path, "w", errors="xmlcharrefreplace") as f:
        f.write(element.export_to_html())
    results['tables_html'].append(html_path)
|
160 |
+
|
161 |
+
def display_export_content(title: str, content: str, format: str):
    """Render one exported document inside a collapsible section.

    Args:
        title: Text shown in the expander header.
        content: Exported text to display.
        format: "md" and "multimodal" are rendered as Markdown (the latter
            preceded by an info banner), "json" and "yaml" as a code block;
            any other value leaves the expander empty.
    """
    with st.expander(f"📄 {title}"):
        if format in ("json", "yaml"):
            st.code(content, language=format)
            return

        if format == "multimodal":
            st.info("Affichage multimodal combinant texte, images et tableaux")

        if format in ("md", "multimodal"):
            st.markdown(content)
|
170 |
+
|
171 |
+
def display_results(results: Dict):
    """Show the conversion time, the document exports, the figures and the tables.

    Args:
        results: Dict read for 'processing_time', 'exports', 'figures',
            'tables_csv' and 'tables_html'.
    """
    time_banner = st.session_state.time_placeholder
    time_banner.success(f"⏱ Temps total de conversion : {int(results['processing_time'])} secondes")

    # Document exports, one sub-section per format that produced output.
    for fmt, exports in results['exports'].items():
        if not exports:
            continue
        st.subheader(f"📁 Exports {fmt.upper()}")
        for export in exports:
            if fmt != "multimodal":
                file_path, content = export
                display_export_content(file_path.name, content, fmt)
            else:
                display_multimodal_result(export)

    # Figures, laid out round-robin over three columns.
    figure_paths = results['figures']
    if figure_paths:
        st.subheader("🖼️ Figures extraites")
        cols = st.columns(3)
        for idx, fig_path in enumerate(figure_paths):
            column = cols[idx % 3]
            try:
                column.image(Image.open(fig_path), caption=fig_path.name, use_container_width=True)
            except Exception:
                column.error(f"Erreur d'affichage de {fig_path.name}")

    # Tables: nothing more to draw when neither list has entries.
    if not (results['tables_csv'] or results['tables_html']):
        return

    st.subheader("📋 Tableaux extraits")
    display_format = st.radio("Format d'affichage", ['CSV', 'HTML'], horizontal=True)

    if display_format != 'CSV':
        for html_path in results['tables_html']:
            try:
                with open(html_path) as handle:
                    st.write(f"**{html_path.stem}**")
                    # The table markup is injected as raw HTML.
                    st.markdown(handle.read(), unsafe_allow_html=True)
            except Exception as e:
                st.error(f"Erreur de lecture HTML {html_path.name}: {str(e)}")
        return

    for table_path in results['tables_csv']:
        try:
            df = pd.read_csv(table_path)
            st.write(f"**{table_path.stem}**")
            left_aligned = df.style.set_properties(**{'text-align': 'left'})
            st.dataframe(left_aligned)
        except Exception as e:
            st.error(f"Erreur de lecture CSV {table_path.name}: {str(e)}")
|
216 |
+
|
217 |
+
def display_multimodal_result(file_path: Path):
    """Show a multimodal export next to the files derived from the same document.

    The left column renders the export file as Markdown. The right column
    lists every other file under OUTPUT_DIR whose name starts with the
    export's stem, previewing PNG/JPG images and the first three rows of
    CSV files.

    Args:
        file_path: Path of the multimodal export file.
    """
    import glob  # local import: only used to escape the stem for globbing

    with st.expander(f"🌈 {file_path.name}"):
        col1, col2 = st.columns([2, 1])

        with col1:
            try:
                with open(file_path, "r") as f:
                    content = f.read()
                st.markdown(content)
            except Exception as e:
                st.error(f"Erreur de lecture : {str(e)}")

        with col2:
            # Figures and tables are saved in sub-folders of OUTPUT_DIR, so
            # the search has to be recursive to find them. The stem is
            # escaped because document names may contain glob
            # metacharacters such as "[" or "*".
            pattern = f"{glob.escape(file_path.stem)}*"
            related_files = sorted(
                f for f in OUTPUT_DIR.rglob(pattern)
                if f != file_path and not f.is_dir()
            )

            if related_files:
                st.write("Fichiers associés :")
                for f in related_files:
                    st.write(f"- `{f.name}`")
                    suffix = f.suffix.lower()
                    if suffix in (".png", ".jpg"):
                        try:
                            # use_container_width matches the keyword used
                            # for figures in display_results.
                            st.image(Image.open(f), use_container_width=True)
                        except Exception as e:
                            st.error(f"Erreur d'affichage image : {str(e)}")
                    elif suffix == ".csv":
                        try:
                            st.dataframe(pd.read_csv(f).head(3))
                        except Exception as e:
                            st.error(f"Erreur d'affichage CSV : {str(e)}")
|
246 |
+
|
247 |
+
# User interface
def _build_results_archive(directory: Path) -> bytes:
    """Return an in-memory ZIP archive of every file found under *directory*."""
    import io
    import zipfile

    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as archive:
        # Sorted for a stable member order; names are stored relative to
        # the directory so the archive has no leading "output/" component.
        for path in sorted(directory.rglob("*")):
            if path.is_file():
                archive.write(path, arcname=path.relative_to(directory).as_posix())
    return buffer.getvalue()


def main():
    """Build the Streamlit page: upload widget, options, conversion and download."""
    st.title("📊🦆 Docling Document Converter")
    # Placeholder created right under the title; display_results fills it
    # with the total conversion time.
    st.session_state.time_placeholder = st.empty()

    uploaded_files = st.file_uploader(
        "Téléchargez vos documents",
        accept_multiple_files=True,
        type=["pdf", "docx", "pptx", "html", "png", "jpg"]
    )

    with st.expander("Options avancées"):
        config = {
            'use_ocr': st.checkbox("Activer OCR", True),
            'export_figures': st.checkbox("Exporter les images", True),
            'export_tables': st.checkbox("Exporter les tableaux", True),
            'ocr_engine': st.selectbox("Moteur OCR", ["easyocr", "tesseract_cli", "tesserocr", "rapidocr", "ocrmac"]),
            'ocr_languages': st.text_input("Langues OCR (séparées par des virgules)", "en").split(','),
            'table_mode': st.selectbox("Mode des tableaux", ["ACCURATE", "FAST"]),
            'export_formats': st.multiselect(
                "Formats d'export",
                ["json", "yaml", "md", "multimodal"],
                default=["md"]
            ),
            'accelerator': st.selectbox("Accélérateur matériel", ["cpu", "cuda", "mps"], index=0)
        }

    # "en, fr" must yield ["en", "fr"], not ["en", " fr"]; an empty field
    # falls back to the text input's default language.
    config['ocr_languages'] = [
        lang.strip() for lang in config['ocr_languages'] if lang.strip()
    ] or ["en"]

    if st.button("Démarrer la conversion"):
        if not uploaded_files:
            st.error("⚠️ Veuillez télécharger au moins un fichier")
            return

        results = process_files(uploaded_files, config)
        display_results(results)
        st.success("✅ Conversion terminée avec succès !")
        # The button needs the archive content itself; passing the directory
        # path would make the download a text file containing that path.
        st.download_button(
            "Télécharger tous les résultats",
            data=_build_results_archive(OUTPUT_DIR),
            file_name="conversion_results.zip",
            mime="application/zip",
        )


if __name__ == "__main__":
    main()
|