Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,9 +1,11 @@
|
|
1 |
import os
|
2 |
import time
|
|
|
3 |
from pathlib import Path
|
4 |
from typing import List
|
5 |
from PyPDF2 import PdfReader
|
6 |
import streamlit as st
|
|
|
7 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
8 |
from docling.datamodel.base_models import InputFormat
|
9 |
from docling.datamodel.document import ConversionStatus
|
@@ -19,18 +21,28 @@ from docling.datamodel.pipeline_options import (
|
|
19 |
RapidOcrOptions,
|
20 |
OcrMacOptions,
|
21 |
)
|
|
|
22 |
|
23 |
# Répertoires de sortie
|
24 |
OUTPUT_DIR = Path("output")
|
25 |
OUTPUT_DIR.mkdir(exist_ok=True)
|
26 |
|
27 |
-
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
try:
|
30 |
-
|
31 |
-
|
|
|
|
|
|
|
32 |
except Exception as e:
|
33 |
-
st.error(f"
|
34 |
return False
|
35 |
|
36 |
# Fonction pour configurer le convertisseur de documents
|
@@ -53,7 +65,7 @@ def create_document_converter(
|
|
53 |
do_cell_matching=True,
|
54 |
)
|
55 |
|
56 |
-
# OCR avec
|
57 |
if ocr_engine == "easyocr":
|
58 |
ocr_options = EasyOcrOptions(lang=ocr_languages)
|
59 |
elif ocr_engine == "tesseract_cli":
|
@@ -88,12 +100,12 @@ def create_document_converter(
|
|
88 |
format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)},
|
89 |
)
|
90 |
|
91 |
-
# Interface Streamlit
|
92 |
-
st.title("📄 Conversion de documents
|
93 |
-
st.subheader("🖼️ Téléchargez un
|
94 |
|
95 |
uploaded_files = st.file_uploader(
|
96 |
-
"Sélectionnez vos fichiers PDF", accept_multiple_files=True
|
97 |
)
|
98 |
use_ocr = st.checkbox("👁️🗨️ Activer l'OCR", value=True)
|
99 |
export_figures = st.checkbox("🖼️ Exporter les images", value=True)
|
@@ -101,6 +113,9 @@ export_tables = st.checkbox("📋 Exporter les tableaux", value=True)
|
|
101 |
ocr_engine = st.selectbox("Moteur OCR", ["easyocr", "tesseract_cli", "tesserocr", "rapidocr", "ocrmac"])
|
102 |
ocr_languages = st.text_input("Langues OCR (ex : eng, fra)", "eng").split(",")
|
103 |
table_mode = st.selectbox("Mode des tableaux", ["ACCURATE", "FAST"])
|
|
|
|
|
|
|
104 |
|
105 |
if st.button("Convertir"):
|
106 |
if uploaded_files:
|
@@ -116,12 +131,11 @@ if st.button("Convertir"):
|
|
116 |
f.write(uploaded_file.read())
|
117 |
st.write(f"📥 Fichier reçu : `{file_path.name}` ({os.path.getsize(file_path)} octets)")
|
118 |
|
119 |
-
if not
|
120 |
-
st.error(f"❌ Le fichier {file_path.name} n'est pas un PDF valide.")
|
121 |
continue
|
122 |
input_paths.append(file_path)
|
123 |
|
124 |
-
# Configurer le convertisseur
|
125 |
converter = create_document_converter(
|
126 |
use_ocr,
|
127 |
export_figures,
|
@@ -132,7 +146,7 @@ if st.button("Convertir"):
|
|
132 |
ocr_languages=ocr_languages,
|
133 |
)
|
134 |
|
135 |
-
# Barre de progression
|
136 |
progress_bar = st.progress(0)
|
137 |
status_placeholder = st.empty()
|
138 |
|
@@ -148,21 +162,25 @@ if st.button("Convertir"):
|
|
148 |
for conv_res in conv_results:
|
149 |
if conv_res.status == ConversionStatus.SUCCESS:
|
150 |
st.success(f"✅ Conversion réussie : `{conv_res.input.file}`")
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
else:
|
156 |
st.error(f"❌ Échec de la conversion pour : `{conv_res.input.file}`")
|
157 |
|
158 |
-
# Temps estimé restant
|
159 |
-
elapsed_time = time.time() - file_start_time
|
160 |
-
remaining_files = total_files - (i + 1)
|
161 |
-
estimated_time_remaining = elapsed_time * remaining_files
|
162 |
-
status_placeholder.info(
|
163 |
-
f"⏳ Temps restant estimé : {int(estimated_time_remaining)} secondes"
|
164 |
-
)
|
165 |
-
|
166 |
# Mise à jour de la barre de progression
|
167 |
progress_bar.progress((i + 1) / total_files)
|
168 |
|
@@ -178,4 +196,4 @@ if st.button("Convertir"):
|
|
178 |
total_time = time.time() - start_time
|
179 |
st.success(f"✅ Conversion terminée en {int(total_time)} secondes !")
|
180 |
else:
|
181 |
-
st.error("❌ Veuillez télécharger au moins un fichier
|
|
|
1 |
import os
|
2 |
import time
|
3 |
+
import datetime
|
4 |
from pathlib import Path
|
5 |
from typing import List
|
6 |
from PyPDF2 import PdfReader
|
7 |
import streamlit as st
|
8 |
+
import pandas as pd
|
9 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
10 |
from docling.datamodel.base_models import InputFormat
|
11 |
from docling.datamodel.document import ConversionStatus
|
|
|
21 |
RapidOcrOptions,
|
22 |
OcrMacOptions,
|
23 |
)
|
24 |
+
from docling_core.types.doc import PictureItem, TableItem
|
25 |
|
26 |
# Répertoires de sortie
|
27 |
OUTPUT_DIR = Path("output")
|
28 |
OUTPUT_DIR.mkdir(exist_ok=True)
|
29 |
|
30 |
+
FIGURES_DIR = OUTPUT_DIR / "figures"
|
31 |
+
FIGURES_DIR.mkdir(exist_ok=True)
|
32 |
+
|
33 |
+
TABLES_DIR = OUTPUT_DIR / "tables"
|
34 |
+
TABLES_DIR.mkdir(exist_ok=True)
|
35 |
+
|
36 |
+
# Vérification de validité des fichiers
|
37 |
+
def is_valid_file(file_path):
    """Return True when ``file_path`` has an extension the app accepts.

    Only the file suffix is examined (case-insensitively); the file contents
    are never opened. Every rejection is reported to the user via
    ``st.error``.

    Args:
        file_path: Location of the uploaded file, given as a
            ``pathlib.Path`` or any string/path-like value.

    Returns:
        True if the suffix is one of the supported formats, False otherwise
        (including when ``file_path`` cannot be interpreted as a path).
    """
    # ".jpeg" is listed next to ".jpg": both name the same image format, and
    # accepting only one of them rejected valid uploads.
    supported_suffixes = {".pdf", ".docx", ".pptx", ".html", ".png", ".jpg", ".jpeg"}

    try:
        # Coerce so that plain strings work as well as Path objects.
        suffix = Path(file_path).suffix
    except TypeError as e:
        # Raised when the value is not path-like at all (e.g. None).
        st.error(f"❌ Erreur lors de la vérification du fichier : {e}")
        return False

    if suffix.lower() in supported_suffixes:
        return True

    st.error(f"❌ Format non supporté : {suffix}")
    return False
|
47 |
|
48 |
# Fonction pour configurer le convertisseur de documents
|
|
|
65 |
do_cell_matching=True,
|
66 |
)
|
67 |
|
68 |
+
# OCR avec le moteur choisi
|
69 |
if ocr_engine == "easyocr":
|
70 |
ocr_options = EasyOcrOptions(lang=ocr_languages)
|
71 |
elif ocr_engine == "tesseract_cli":
|
|
|
100 |
format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)},
|
101 |
)
|
102 |
|
103 |
+
# Interface utilisateur avec Streamlit
|
104 |
+
st.title("📄 Conversion de documents avec OCR et export multimodal")
|
105 |
+
st.subheader("🖼️ Téléchargez un document pour commencer le traitement")
|
106 |
|
107 |
uploaded_files = st.file_uploader(
|
108 |
+
"Sélectionnez vos fichiers (PDF, DOCX, PPTX, HTML, Images)", accept_multiple_files=True
|
109 |
)
|
110 |
use_ocr = st.checkbox("👁️🗨️ Activer l'OCR", value=True)
|
111 |
export_figures = st.checkbox("🖼️ Exporter les images", value=True)
|
|
|
113 |
ocr_engine = st.selectbox("Moteur OCR", ["easyocr", "tesseract_cli", "tesserocr", "rapidocr", "ocrmac"])
|
114 |
ocr_languages = st.text_input("Langues OCR (ex : eng, fra)", "eng").split(",")
|
115 |
table_mode = st.selectbox("Mode des tableaux", ["ACCURATE", "FAST"])
|
116 |
+
export_formats = st.multiselect(
|
117 |
+
"Formats d'exportation", ["json", "yaml", "md", "multimodal"], default=["md"]
|
118 |
+
)
|
119 |
|
120 |
if st.button("Convertir"):
|
121 |
if uploaded_files:
|
|
|
131 |
f.write(uploaded_file.read())
|
132 |
st.write(f"📥 Fichier reçu : `{file_path.name}` ({os.path.getsize(file_path)} octets)")
|
133 |
|
134 |
+
if not is_valid_file(file_path):
|
|
|
135 |
continue
|
136 |
input_paths.append(file_path)
|
137 |
|
138 |
+
# Configurer le convertisseur
|
139 |
converter = create_document_converter(
|
140 |
use_ocr,
|
141 |
export_figures,
|
|
|
146 |
ocr_languages=ocr_languages,
|
147 |
)
|
148 |
|
149 |
+
# Barre de progression
|
150 |
progress_bar = st.progress(0)
|
151 |
status_placeholder = st.empty()
|
152 |
|
|
|
162 |
for conv_res in conv_results:
|
163 |
if conv_res.status == ConversionStatus.SUCCESS:
|
164 |
st.success(f"✅ Conversion réussie : `{conv_res.input.file}`")
|
165 |
+
for fmt in export_formats:
|
166 |
+
output_file = OUTPUT_DIR / f"{conv_res.input.file.stem}.{fmt}"
|
167 |
+
if fmt == "md":
|
168 |
+
with open(output_file, "w") as f:
|
169 |
+
f.write(f"## Contenu converti pour `{conv_res.input.file}`")
|
170 |
+
elif fmt == "json":
|
171 |
+
with open(output_file, "w", encoding="utf-8") as f:
|
172 |
+
json.dump(conv_res.document.export_to_dict(), f, ensure_ascii=False, indent=2)
|
173 |
+
elif fmt == "yaml":
|
174 |
+
with open(output_file, "w", encoding="utf-8") as f:
|
175 |
+
yaml.dump(conv_res.document.export_to_dict(), f, allow_unicode=True)
|
176 |
+
elif fmt == "multimodal":
|
177 |
+
# Simulation d'un export multimodal
|
178 |
+
multimodal_output = OUTPUT_DIR / f"{conv_res.input.file.stem}_multimodal.parquet"
|
179 |
+
st.write(f"Multimodal export simulé : `{multimodal_output}`")
|
180 |
+
generated_files.append(output_file)
|
181 |
else:
|
182 |
st.error(f"❌ Échec de la conversion pour : `{conv_res.input.file}`")
|
183 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
# Mise à jour de la barre de progression
|
185 |
progress_bar.progress((i + 1) / total_files)
|
186 |
|
|
|
196 |
total_time = time.time() - start_time
|
197 |
st.success(f"✅ Conversion terminée en {int(total_time)} secondes !")
|
198 |
else:
|
199 |
+
st.error("❌ Veuillez télécharger au moins un fichier.")
|