Update app.py
Browse files
app.py
CHANGED
@@ -25,9 +25,20 @@ def run_ocr(pdf_file, langs: str, rotate_pages: bool, deskew: bool, clean: bool,
|
|
25 |
st.error("系统未检测到 ocrmypdf,可检查 Docker/依赖安装。")
|
26 |
return None, None, None
|
27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
# 保存上传文件到临时路径
|
29 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
|
30 |
-
tmp.write(
|
31 |
in_path = Path(tmp.name)
|
32 |
|
33 |
work = Path(tempfile.mkdtemp(prefix="ocr_"))
|
@@ -91,7 +102,7 @@ def run_extract(sidecar_text: str, provider: str, model_id: str, prompt: str):
|
|
91 |
# 最小 few-shot 示例(演示用;生产请替换为你的法律 schema)
|
92 |
examples = [
|
93 |
lx.data.ExampleData(
|
94 |
-
text="On 12/03/2023, ABC Pty Ltd terminated Ms Wang
|
95 |
extractions=[
|
96 |
lx.data.Extraction(
|
97 |
extraction_class="party",
|
@@ -143,7 +154,7 @@ def run_extract(sidecar_text: str, provider: str, model_id: str, prompt: str):
|
|
143 |
with st.sidebar:
|
144 |
st.header("参数")
|
145 |
|
146 |
-
# 用 form
|
147 |
with st.form("run_form", clear_on_submit=False):
|
148 |
pdf = st.file_uploader("上传扫描 PDF", type=["pdf"], accept_multiple_files=False, key="pdf_uploader")
|
149 |
|
@@ -194,6 +205,12 @@ if "pdf_uploader" in st.session_state and st.session_state["pdf_uploader"]:
|
|
194 |
f"({st.session_state['pdf_uploader'].size/1024:.1f} KB)")
|
195 |
|
196 |
if submitted:
|
|
|
|
|
|
|
|
|
|
|
|
|
197 |
out_pdf, sidecar_path, preview = run_ocr(
|
198 |
pdf, langs, rotate_pages, deskew, clean, optimize_level,
|
199 |
force_ocr, skip_text, export_sidecar
|
@@ -215,4 +232,4 @@ if submitted:
|
|
215 |
st.components.v1.html(html_content, height=650, scrolling=True)
|
216 |
if jsonl_path and Path(jsonl_path).exists():
|
217 |
with open(jsonl_path, "rb") as f:
|
218 |
-
jsonl_slot.download_button("下载抽取结果 JSONL", f, file_name="extractions.jsonl")
|
|
|
25 |
st.error("系统未检测到 ocrmypdf,可检查 Docker/依赖安装。")
|
26 |
return None, None, None
|
27 |
|
28 |
+
# 修复:重置文件指针到开头,然后读取内容
|
29 |
+
try:
|
30 |
+
pdf_file.seek(0) # 重置文件指针
|
31 |
+
pdf_content = pdf_file.read()
|
32 |
+
if not pdf_content:
|
33 |
+
st.error("PDF 文件内容为空。")
|
34 |
+
return None, None, None
|
35 |
+
except Exception as e:
|
36 |
+
st.error(f"读取 PDF 文件失败:{e}")
|
37 |
+
return None, None, None
|
38 |
+
|
39 |
# 保存上传文件到临时路径
|
40 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
|
41 |
+
tmp.write(pdf_content) # 使用读取到的内容
|
42 |
in_path = Path(tmp.name)
|
43 |
|
44 |
work = Path(tempfile.mkdtemp(prefix="ocr_"))
|
|
|
102 |
# 最小 few-shot 示例(演示用;生产请替换为你的法律 schema)
|
103 |
examples = [
|
104 |
lx.data.ExampleData(
|
105 |
+
text="On 12/03/2023, ABC Pty Ltd terminated Ms Wang's employment.",
|
106 |
extractions=[
|
107 |
lx.data.Extraction(
|
108 |
extraction_class="party",
|
|
|
154 |
with st.sidebar:
|
155 |
st.header("参数")
|
156 |
|
157 |
+
# 用 form 把"上传 + 参数 + 提交"打包,避免按钮重跑导致 file_uploader 丢值
|
158 |
with st.form("run_form", clear_on_submit=False):
|
159 |
pdf = st.file_uploader("上传扫描 PDF", type=["pdf"], accept_multiple_files=False, key="pdf_uploader")
|
160 |
|
|
|
205 |
f"({st.session_state['pdf_uploader'].size/1024:.1f} KB)")
|
206 |
|
207 |
if submitted:
|
208 |
+
# 添加调试信息
|
209 |
+
if pdf is None:
|
210 |
+
st.error("PDF 为 None - 检查文件上传")
|
211 |
+
else:
|
212 |
+
st.info(f"PDF 文件信息:名称={pdf.name}, 大小={pdf.size} bytes")
|
213 |
+
|
214 |
out_pdf, sidecar_path, preview = run_ocr(
|
215 |
pdf, langs, rotate_pages, deskew, clean, optimize_level,
|
216 |
force_ocr, skip_text, export_sidecar
|
|
|
232 |
st.components.v1.html(html_content, height=650, scrolling=True)
|
233 |
if jsonl_path and Path(jsonl_path).exists():
|
234 |
with open(jsonl_path, "rb") as f:
|
235 |
+
jsonl_slot.download_button("下载抽取结果 JSONL", f, file_name="extractions.jsonl")
|