Artemis-IA committed on
Commit
6a94414
·
verified ·
1 Parent(s): 83b84e6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +228 -166
app.py CHANGED
@@ -1,12 +1,12 @@
1
  import os
2
  import time
3
- import datetime
4
  from pathlib import Path
5
- from typing import List
6
  from PIL import Image
7
- from PyPDF2 import PdfReader
8
  import streamlit as st
9
  import pandas as pd
 
 
10
  from docling.document_converter import DocumentConverter, PdfFormatOption
11
  from docling.datamodel.base_models import InputFormat
12
  from docling.datamodel.document import ConversionStatus
@@ -24,70 +24,47 @@ from docling.datamodel.pipeline_options import (
24
  )
25
  from docling_core.types.doc import PictureItem, TableItem
26
 
27
- # Répertoires de sortie
28
  OUTPUT_DIR = Path("output")
29
- OUTPUT_DIR.mkdir(exist_ok=True)
30
-
31
  FIGURES_DIR = OUTPUT_DIR / "figures"
32
- FIGURES_DIR.mkdir(exist_ok=True)
33
-
34
  TABLES_DIR = OUTPUT_DIR / "tables"
35
- TABLES_DIR.mkdir(exist_ok=True)
36
 
37
- # Vérification de validité des fichiers
38
- def is_valid_file(file_path):
39
- try:
40
- if file_path.suffix.lower() in [".pdf", ".docx", ".pptx", ".html", ".png", ".jpg"]:
41
- return True
42
- else:
43
- st.error(f" Format non supporté : {file_path.suffix}")
44
- return False
45
- except Exception as e:
46
- st.error(f"❌ Erreur lors de la vérification du fichier : {e}")
47
- return False
48
-
49
- # Fonction pour configurer le convertisseur de documents
50
- def create_document_converter(
51
- use_ocr: bool,
52
- export_figures: bool,
53
- export_tables: bool,
54
- accelerator: str,
55
- ocr_engine: str,
56
- table_mode: str,
57
- ocr_languages: List[str],
58
- ) -> DocumentConverter:
59
  accelerator_options = AcceleratorOptions(
60
  num_threads=8,
61
- device=AcceleratorDevice[accelerator.upper()],
62
  )
63
 
64
  table_structure_options = TableStructureOptions(
65
- mode=TableFormerMode[table_mode.upper()],
66
- do_cell_matching=True,
67
  )
68
 
69
- # OCR avec le moteur choisi
70
- if ocr_engine == "easyocr":
71
- ocr_options = EasyOcrOptions(lang=ocr_languages)
72
- elif ocr_engine == "tesseract_cli":
73
- ocr_options = TesseractCliOcrOptions(lang=ocr_languages)
74
- elif ocr_engine == "tesserocr":
75
- ocr_options = TesseractOcrOptions(lang=ocr_languages)
76
- elif ocr_engine == "rapidocr":
77
- ocr_options = RapidOcrOptions(lang=ocr_languages)
78
- elif ocr_engine == "ocrmac":
79
- ocr_options = OcrMacOptions(lang=ocr_languages)
80
- else:
81
- raise ValueError(f"Moteur OCR non pris en charge : {ocr_engine}")
82
 
83
  pipeline_options = PdfPipelineOptions(
84
- do_ocr=use_ocr,
85
  generate_page_images=True,
86
- generate_picture_images=export_figures,
87
- generate_table_images=export_tables,
88
  accelerator_options=accelerator_options,
89
  table_structure_options=table_structure_options,
90
- ocr_options=ocr_options,
91
  )
92
 
93
  return DocumentConverter(
@@ -96,130 +73,215 @@ def create_document_converter(
96
  InputFormat.DOCX,
97
  InputFormat.PPTX,
98
  InputFormat.HTML,
99
- InputFormat.IMAGE,
100
  ],
101
- format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)},
102
  )
103
 
104
- # Interface utilisateur avec Streamlit
105
- st.title("📊 Docling document converter ")
106
- st.subheader("📤 Téléchargez un, ou plusieurs document pour commencer le traitement")
 
 
 
 
 
 
107
 
108
- uploaded_files = st.file_uploader(
109
- "Sélectionnez vos fichiers (PDF, DOCX, PPTX, HTML, Images)", accept_multiple_files=True
110
- )
111
- use_ocr = st.checkbox("👁️‍🗨️ Activer l'OCR", value=True)
112
- export_figures = st.checkbox("🖼️ Exporter les images", value=True)
113
- export_tables = st.checkbox("📋 Exporter les tableaux", value=True)
114
- ocr_engine = st.selectbox("Moteur OCR", ["easyocr", "tesseract_cli", "tesserocr", "rapidocr", "ocrmac"])
115
- ocr_languages = st.text_input("Langues OCR (ex : en, fr)", "en").split(",")
116
- table_mode = st.selectbox("Mode des tableaux", ["ACCURATE", "FAST"])
117
- export_formats = st.multiselect(
118
- "Formats d'exportation", ["json", "yaml", "md", "multimodal"], default=["md"]
119
- )
120
 
121
- if st.button("Convertir"):
122
- if uploaded_files:
123
- input_paths = []
124
- generated_files = []
125
- figures = []
126
- tables = []
127
- total_files = len(uploaded_files)
128
- start_time = time.time() # Chronomètre de démarrage
129
-
130
- # Charger les fichiers téléchargés
131
- for uploaded_file in uploaded_files:
132
  file_path = OUTPUT_DIR / uploaded_file.name
133
- with open(file_path, "wb") as f:
134
- f.write(uploaded_file.read())
135
- st.write(f"📥 Fichier reçu : `{file_path.name}` ({os.path.getsize(file_path)} octets)")
136
-
137
  if not is_valid_file(file_path):
138
  continue
139
- input_paths.append(file_path)
140
-
141
- # Configurer le convertisseur
142
- converter = create_document_converter(
143
- use_ocr,
144
- export_figures,
145
- export_tables,
146
- accelerator="cpu",
147
- ocr_engine=ocr_engine,
148
- table_mode=table_mode,
149
- ocr_languages=ocr_languages,
150
- )
151
-
152
- # Barre de progression
153
- progress_bar = st.progress(0)
154
- status_placeholder = st.empty()
155
-
156
- # Conversion des fichiers
157
- for i, file_path in enumerate(input_paths):
158
- status_placeholder.info(
159
- f"🔄 Traitement de `{file_path.name}` ({i + 1}/{total_files})"
160
- )
161
-
162
- # Conversion du fichier
163
  conv_results = list(converter.convert_all([file_path], raises_on_error=False))
 
164
  for conv_res in conv_results:
165
  if conv_res.status == ConversionStatus.SUCCESS:
166
- st.success(f"✅ Conversion réussie : `{conv_res.input.file}`")
167
-
168
- # Exporter les résultats
169
- for fmt in export_formats:
170
- output_file = OUTPUT_DIR / f"{conv_res.input.file.stem}.{fmt}"
171
- if fmt == "md":
172
- with open(output_file, "w") as f:
173
- f.write(conv_res.document.export_to_markdown())
174
- elif fmt == "json":
175
- with open(output_file, "w", encoding="utf-8") as f:
176
- json.dump(conv_res.document.export_to_dict(), f, ensure_ascii=False, indent=2)
177
- elif fmt == "yaml":
178
- with open(output_file, "w", encoding="utf-8") as f:
179
- yaml.dump(conv_res.document.export_to_dict(), f, allow_unicode=True)
180
- generated_files.append(output_file)
181
-
182
- # Export des figures et tables
183
- for element, _ in conv_res.document.iterate_items():
184
- if isinstance(element, PictureItem):
185
- fig_path = FIGURES_DIR / f"{conv_res.input.file.stem}_figure.png"
186
- element.image.pil_image.save(fig_path)
187
- figures.append(fig_path)
188
- elif isinstance(element, TableItem):
189
- table_path = TABLES_DIR / f"{conv_res.input.file.stem}_table.csv"
190
- table_df = element.export_to_dataframe()
191
- table_df.to_csv(table_path, index=False)
192
- tables.append(table_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  else:
194
- st.error(f"❌ Échec de la conversion pour : `{conv_res.input.file}`")
195
-
196
- # Mise à jour de la barre de progression
197
- progress_bar.progress((i + 1) / total_files)
198
-
199
- # Affichage des fichiers générés
200
- st.subheader("📂 Fichiers générés")
201
- for generated_file in generated_files:
202
- st.markdown(f"📄 **{generated_file.name}**")
203
- with open(generated_file, "r") as f:
204
- content = f.read()
205
- st.text_area(f"Prévisualisation : {generated_file.name}", value=content, height=200)
206
-
207
- # Affichage des figures extraites
208
- if figures:
209
- st.subheader("🖼️ Figures extraites")
210
- for fig in figures:
211
- st.image(Image.open(fig), caption=fig.name)
212
-
213
- # Affichage des tableaux extraits
214
- if tables:
215
- st.subheader("📋 Tableaux extraits")
216
- for table in tables:
217
- st.markdown(f"📄 **{table.name}**")
218
- table_df = pd.read_csv(table)
219
- st.dataframe(table_df)
220
-
221
- # Temps total écoulé
222
- total_time = time.time() - start_time
223
- st.success(f"✅ Conversion terminée en {int(total_time)} secondes !")
224
- else:
225
- st.error("❌ Veuillez télécharger au moins un fichier.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import time
 
3
  from pathlib import Path
4
+ from typing import List, Dict
5
  from PIL import Image
 
6
  import streamlit as st
7
  import pandas as pd
8
+ import json
9
+ import yaml
10
  from docling.document_converter import DocumentConverter, PdfFormatOption
11
  from docling.datamodel.base_models import InputFormat
12
  from docling.datamodel.document import ConversionStatus
 
24
  )
25
  from docling_core.types.doc import PictureItem, TableItem
26
 
27
# Output locations: everything the app writes lives under OUTPUT_DIR, with
# extracted figures and tables kept in their own sub-folders.
OUTPUT_DIR = Path("output")
FIGURES_DIR = OUTPUT_DIR.joinpath("figures")
TABLES_DIR = OUTPUT_DIR.joinpath("tables")
 
31
 
32
def setup_directories():
    """Create the output folder and its figures/tables sub-folders.

    Folders that already exist are left untouched.
    """
    # OUTPUT_DIR comes first: the sub-folders are created without
    # ``parents=True`` and therefore need their parent to exist already.
    for folder in (OUTPUT_DIR, FIGURES_DIR, TABLES_DIR):
        folder.mkdir(exist_ok=True)
36
+
37
def is_valid_file(file_path: Path) -> bool:
    """Return True when the file's extension is one the app accepts.

    The comparison is case-insensitive and looks only at the final suffix;
    the file's content is not inspected.

    Args:
        file_path: Path (or just the name) of the uploaded file.

    Returns:
        True if the suffix is in the supported list, False otherwise.
    """
    # ".jpeg" and ".htm" are the common aliases of ".jpg" and ".html".
    # NOTE(review): the upload widget's type filter appears to let these
    # aliases through when "jpg"/"html" are allowed - confirm against the
    # Streamlit version in use. Without them such files were skipped silently.
    valid_extensions = (
        ".pdf", ".docx", ".pptx", ".html", ".htm", ".png", ".jpg", ".jpeg",
    )
    return file_path.suffix.lower() in valid_extensions
40
+
41
+ def create_document_converter(config: Dict) -> DocumentConverter:
 
 
 
 
 
 
 
 
 
 
 
 
42
  accelerator_options = AcceleratorOptions(
43
  num_threads=8,
44
+ device=AcceleratorDevice[config['accelerator'].upper()]
45
  )
46
 
47
  table_structure_options = TableStructureOptions(
48
+ mode=TableFormerMode[config['table_mode'].upper()],
49
+ do_cell_matching=True
50
  )
51
 
52
+ ocr_engines = {
53
+ "easyocr": EasyOcrOptions(lang=config['ocr_languages']),
54
+ "tesseract_cli": TesseractCliOcrOptions(lang=config['ocr_languages']),
55
+ "tesserocr": TesseractOcrOptions(lang=config['ocr_languages']),
56
+ "rapidocr": RapidOcrOptions(lang=config['ocr_languages']),
57
+ "ocrmac": OcrMacOptions(lang=config['ocr_languages'])
58
+ }
 
 
 
 
 
 
59
 
60
  pipeline_options = PdfPipelineOptions(
61
+ do_ocr=config['use_ocr'],
62
  generate_page_images=True,
63
+ generate_picture_images=config['export_figures'],
64
+ generate_table_images=config['export_tables'],
65
  accelerator_options=accelerator_options,
66
  table_structure_options=table_structure_options,
67
+ ocr_options=ocr_engines[config['ocr_engine']]
68
  )
69
 
70
  return DocumentConverter(
 
73
  InputFormat.DOCX,
74
  InputFormat.PPTX,
75
  InputFormat.HTML,
76
+ InputFormat.IMAGE
77
  ],
78
+ format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
79
  )
80
 
81
def process_files(uploaded_files, config: Dict) -> Dict:
    """Convert every uploaded file and collect the generated artefacts.

    Each upload is saved under OUTPUT_DIR, converted, and - on success -
    handed to ``handle_successful_conversion``. A progress bar and a status
    line are rendered while the loop runs.

    Args:
        uploaded_files: Sequence of uploaded-file objects exposing ``.name``
            and ``.getbuffer()``.
        config: Conversion settings. ``config['export_formats']`` is read
            here; the whole dict is passed to ``create_document_converter``.

    Returns:
        A dict with the keys ``'figures'``, ``'tables_csv'``,
        ``'tables_html'`` (lists of paths), ``'exports'`` (one list per
        requested format) and ``'processing_time'`` (elapsed seconds).
    """
    setup_directories()
    converter = create_document_converter(config)
    results = {
        'figures': [],
        'tables_csv': [],
        'tables_html': [],
        'exports': {fmt: [] for fmt in config['export_formats']},
    }

    progress_bar = st.progress(0)
    status_placeholder = st.empty()
    start_time = time.time()
    total = len(uploaded_files)

    for idx, uploaded_file in enumerate(uploaded_files):
        try:
            # Keep only the final path component so that a crafted upload
            # name (e.g. "../x.pdf") cannot write outside OUTPUT_DIR.
            file_path = OUTPUT_DIR / Path(uploaded_file.name).name

            # Validate before writing so rejected uploads never reach disk,
            # and tell the user instead of skipping silently.
            if not is_valid_file(file_path):
                st.warning(f"Format non supporté : {file_path.name}")
                continue

            file_path.write_bytes(uploaded_file.getbuffer())
            status_placeholder.info(f"Traitement de {file_path.name} ({idx+1}/{total})")

            for conv_res in converter.convert_all([file_path], raises_on_error=False):
                if conv_res.status == ConversionStatus.SUCCESS:
                    handle_successful_conversion(conv_res, results, config['export_formats'])
                else:
                    # raises_on_error=False reports failures through the
                    # status field only, so surface them explicitly.
                    st.error(f"Échec de la conversion pour : {file_path.name}")
        except Exception as e:
            # Per-file boundary: one bad document must not stop the batch.
            st.error(f"Erreur avec {uploaded_file.name}: {str(e)}")
        finally:
            # Runs on success, skip (continue) and failure alike, so the
            # bar always reaches 100% at the end of the batch.
            progress_bar.progress((idx + 1) / total)

    results['processing_time'] = time.time() - start_time
    return results
118
+
119
def handle_successful_conversion(conv_res, results: Dict, export_formats: List[str]):
    """Write the requested exports for one converted document and extract its items.

    For each requested format a file named ``<input stem>.<format>`` is
    written under OUTPUT_DIR and recorded in ``results['exports'][format]``:
    ``(path, content)`` tuples for "md", "json" and "yaml", and the bare
    path for "multimodal". Pictures and tables found in the document are
    then passed to ``handle_picture_element`` / ``handle_table_element``.

    Args:
        conv_res: Conversion result exposing ``.document`` and ``.input.file``.
        results: Accumulator dict, mutated in place.
        export_formats: Format names; unrecognised names are ignored.
    """
    document = conv_res.document
    stem = conv_res.input.file.stem

    # Document-level exports
    for fmt in export_formats:
        if fmt in ("md", "multimodal"):
            # The multimodal viewer renders its file with st.markdown, so
            # the file holds the Markdown export rather than being empty.
            content = document.export_to_markdown()
        elif fmt == "json":
            content = json.dumps(document.export_to_dict(), ensure_ascii=False, indent=2)
        elif fmt == "yaml":
            content = yaml.dump(document.export_to_dict(), allow_unicode=True)
        else:
            # Unknown format: skip it instead of leaving an empty file behind.
            continue

        output_file = OUTPUT_DIR / f"{stem}.{fmt}"
        # Explicit UTF-8: the JSON/YAML dumps deliberately keep non-ASCII
        # characters, which a non-UTF-8 platform default could not encode.
        output_file.write_text(content, encoding="utf-8")

        bucket = results['exports'].setdefault(fmt, [])
        if fmt == "multimodal":
            bucket.append(output_file)
        else:
            bucket.append((output_file, content))

    # Item-level extraction
    for element, _ in document.iterate_items():
        if isinstance(element, PictureItem):
            handle_picture_element(element, conv_res, results)
        elif isinstance(element, TableItem):
            handle_table_element(element, conv_res, results)
145
+
146
def handle_picture_element(element: PictureItem, conv_res, results: Dict):
    """Save one extracted picture as a PNG and record its path.

    The file is written to FIGURES_DIR as
    ``<input stem>_figure_<n>.png``, where ``n`` is the number of figures
    already recorded in ``results['figures']``.

    Args:
        element: Picture item taken from the converted document.
        conv_res: Conversion result; only ``.input.file.stem`` is used.
        results: Accumulator dict; ``results['figures']`` is appended to.
    """
    # A picture may carry no image data (for instance when picture-image
    # generation is switched off). Skip it rather than raising, which would
    # abort the handling of the rest of the document.
    image_ref = element.image
    pil_image = image_ref.pil_image if image_ref is not None else None
    if pil_image is None:
        return

    fig_path = FIGURES_DIR / f"{conv_res.input.file.stem}_figure_{len(results['figures'])}.png"
    pil_image.save(fig_path)
    results['figures'].append(fig_path)
150
+
151
def handle_table_element(element: TableItem, conv_res, results: Dict):
    """Export one extracted table as CSV and as HTML and record both paths.

    Files are written to TABLES_DIR as ``<input stem>_table_<n>.csv`` and
    ``<input stem>_table_<n>.html``.

    Args:
        element: Table item taken from the converted document.
        conv_res: Conversion result; only ``.input.file.stem`` is used.
        results: Accumulator dict; ``results['tables_csv']`` and
            ``results['tables_html']`` are appended to.
    """
    # One index for both files keeps the CSV and HTML of a table paired by
    # name even if an earlier HTML export failed after its CSV was recorded.
    index = len(results['tables_csv'])
    base_name = f"{conv_res.input.file.stem}_table_{index}"

    csv_path = TABLES_DIR / f"{base_name}.csv"
    element.export_to_dataframe().to_csv(csv_path, index=False)
    results['tables_csv'].append(csv_path)

    html_path = TABLES_DIR / f"{base_name}.html"
    # Explicit UTF-8 so non-ASCII cell text does not depend on the
    # platform's default encoding.
    html_path.write_text(element.export_to_html(), encoding="utf-8")
    results['tables_html'].append(html_path)
160
+
161
def display_export_content(title: str, content: str, format: str):
    """Show one exported document inside a collapsible panel.

    "json" and "yaml" are rendered as highlighted code, "md" as Markdown,
    and "multimodal" as an info banner followed by Markdown. Any other
    format leaves the panel empty.

    Args:
        title: Label of the panel.
        content: Text of the export.
        format: Export format name.
    """
    panel = st.expander(f"📄 {title}")
    with panel:
        if format in ("json", "yaml"):
            st.code(content, language=format)
            return

        if format == "multimodal":
            st.info("Affichage multimodal combinant texte, images et tableaux")

        if format in ("md", "multimodal"):
            st.markdown(content)
170
+
171
def display_results(results: Dict):
    """Render the outcome of a conversion run.

    Shows the elapsed time in ``st.session_state.time_placeholder``, then
    the document exports, the extracted figures (three per row) and the
    extracted tables (CSV and HTML views).

    Args:
        results: Dict with the keys ``'processing_time'``, ``'exports'``,
            ``'figures'``, ``'tables_csv'`` and ``'tables_html'``.
    """
    st.session_state.time_placeholder.success(f"⏱ Temps total de conversion : {int(results['processing_time'])} secondes")

    # Document exports
    for fmt, exports in results['exports'].items():
        if not exports:
            continue
        st.subheader(f"📁 Exports {fmt.upper()}")
        for export in exports:
            if fmt == "multimodal":
                # Multimodal entries are bare paths, not (path, content).
                display_multimodal_result(export)
            else:
                file_path, content = export
                display_export_content(file_path.name, content, fmt)

    # Figures
    if results['figures']:
        st.subheader("🖼️ Figures extraites")
        cols = st.columns(3)
        for idx, fig_path in enumerate(results['figures']):
            column = cols[idx % 3]
            try:
                # Context manager so the image file handle is always closed.
                with Image.open(fig_path) as figure:
                    column.image(figure, caption=fig_path.name, use_container_width=True)
            except Exception:
                column.error(f"Erreur d'affichage de {fig_path.name}")

    # Tables
    if results['tables_csv'] or results['tables_html']:
        st.subheader("📋 Tableaux extraits")
        # Tabs instead of a radio button: changing a radio triggers a script
        # rerun, and when this function is only called from a button handler
        # that rerun would make all results disappear.
        csv_tab, html_tab = st.tabs(['CSV', 'HTML'])

        with csv_tab:
            for table_path in results['tables_csv']:
                try:
                    df = pd.read_csv(table_path)
                    st.write(f"**{table_path.stem}**")
                    st.dataframe(df.style.set_properties(**{'text-align': 'left'}))
                except Exception as e:
                    st.error(f"Erreur de lecture CSV {table_path.name}: {str(e)}")

        with html_tab:
            for html_path in results['tables_html']:
                try:
                    html = html_path.read_text(encoding="utf-8")
                    st.write(f"**{html_path.stem}**")
                    # NOTE(review): this renders HTML derived from uploaded
                    # documents without sanitising it - confirm the table
                    # exporter escapes cell text.
                    st.markdown(html, unsafe_allow_html=True)
                except Exception as e:
                    st.error(f"Erreur de lecture HTML {html_path.name}: {str(e)}")
216
+
217
def display_multimodal_result(file_path: Path):
    """Show a multimodal export next to the files generated from the same input.

    The left column renders the export file as Markdown. The right column
    lists every other file under OUTPUT_DIR (sub-folders included) whose
    name starts with the export's stem, previewing images and the first
    three rows of CSV tables.

    Args:
        file_path: Path of the multimodal export file.
    """
    from glob import escape as glob_escape

    with st.expander(f"🌈 {file_path.name}"):
        col1, col2 = st.columns([2, 1])

        with col1:
            try:
                st.markdown(file_path.read_text(encoding="utf-8"))
            except Exception as e:
                st.error(f"Erreur de lecture : {str(e)}")

        with col2:
            # Recursive search: figures and tables are stored in sub-folders
            # of OUTPUT_DIR, which a plain glob would never reach. The stem
            # is escaped so names containing "[", "]" or "*" match literally.
            pattern = f"{glob_escape(file_path.stem)}*"
            related_files = sorted(
                f for f in OUTPUT_DIR.rglob(pattern)
                if f != file_path and not f.is_dir()
            )

            if related_files:
                st.write("Fichiers associés :")
                for f in related_files:
                    st.write(f"- `{f.name}`")
                    suffix = f.suffix.lower()
                    if suffix in (".png", ".jpg"):
                        try:
                            with Image.open(f) as picture:
                                st.image(picture, use_container_width=True)
                        except Exception as e:
                            st.error(f"Erreur d'affichage de {f.name}: {str(e)}")
                    elif suffix == ".csv":
                        try:
                            st.dataframe(pd.read_csv(f).head(3))
                        except Exception as e:
                            st.error(f"Erreur d'affichage CSV : {str(e)}")
246
+
247
def _zip_directory(directory: Path) -> bytes:
    """Return every file under *directory* packed into an in-memory ZIP archive."""
    import io
    import zipfile

    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as archive:
        for path in sorted(directory.rglob("*")):
            if path.is_file():
                archive.write(path, arcname=path.relative_to(directory).as_posix())
    return buffer.getvalue()


# User interface
def main():
    """Build the Streamlit page: upload widget, options, conversion and results.

    The results of the last conversion are kept in ``st.session_state`` so
    that they stay on screen across the reruns triggered by later widget
    interactions (for example clicking the download button).
    """
    st.title("📊🦆 Docling Document Converter")
    st.session_state.time_placeholder = st.empty()

    uploaded_files = st.file_uploader(
        "Téléchargez vos documents",
        accept_multiple_files=True,
        type=["pdf", "docx", "pptx", "html", "png", "jpg"]
    )

    with st.expander("Options avancées"):
        config = {
            'use_ocr': st.checkbox("Activer OCR", True),
            'export_figures': st.checkbox("Exporter les images", True),
            'export_tables': st.checkbox("Exporter les tableaux", True),
            'ocr_engine': st.selectbox("Moteur OCR", ["easyocr", "tesseract_cli", "tesserocr", "rapidocr", "ocrmac"]),
            'ocr_languages': st.text_input("Langues OCR (séparées par des virgules)", "en").split(','),
            'table_mode': st.selectbox("Mode des tableaux", ["ACCURATE", "FAST"]),
            'export_formats': st.multiselect(
                "Formats d'export",
                ["json", "yaml", "md", "multimodal"],
                default=["md"]
            ),
            'accelerator': st.selectbox("Accélérateur matériel", ["cpu", "cuda", "mps"], index=0)
        }

    # "en, fr" must yield ["en", "fr"], not ["en", " fr"]; an empty field
    # falls back to the widget's default language.
    config['ocr_languages'] = [
        code.strip() for code in config['ocr_languages'] if code.strip()
    ] or ["en"]

    if st.button("Démarrer la conversion"):
        if uploaded_files:
            st.session_state["conversion_results"] = process_files(uploaded_files, config)
        else:
            if "conversion_results" in st.session_state:
                del st.session_state["conversion_results"]
            st.error("⚠️ Veuillez télécharger au moins un fichier")

    if "conversion_results" in st.session_state:
        display_results(st.session_state["conversion_results"])
        st.success("✅ Conversion terminée avec succès !")
        # The download payload must be the archive's bytes; passing the
        # directory path would ship the literal text "output" as the file.
        st.download_button("Télécharger tous les résultats",
                           data=_zip_directory(OUTPUT_DIR),
                           file_name="conversion_results.zip",
                           mime="application/zip")


if __name__ == "__main__":
    main()