nanoapple committed on
Commit
e4a5925
·
verified ·
1 Parent(s): 8c05719

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -4
app.py CHANGED
@@ -25,9 +25,20 @@ def run_ocr(pdf_file, langs: str, rotate_pages: bool, deskew: bool, clean: bool,
25
  st.error("系统未检测到 ocrmypdf,可检查 Docker/依赖安装。")
26
  return None, None, None
27
 
 
 
 
 
 
 
 
 
 
 
 
28
  # 保存上传文件到临时路径
29
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
30
- tmp.write(pdf_file.read())
31
  in_path = Path(tmp.name)
32
 
33
  work = Path(tempfile.mkdtemp(prefix="ocr_"))
@@ -91,7 +102,7 @@ def run_extract(sidecar_text: str, provider: str, model_id: str, prompt: str):
91
  # 最小 few-shot 示例(演示用;生产请替换为你的法律 schema)
92
  examples = [
93
  lx.data.ExampleData(
94
- text="On 12/03/2023, ABC Pty Ltd terminated Ms Wangs employment.",
95
  extractions=[
96
  lx.data.Extraction(
97
  extraction_class="party",
@@ -143,7 +154,7 @@ def run_extract(sidecar_text: str, provider: str, model_id: str, prompt: str):
143
  with st.sidebar:
144
  st.header("参数")
145
 
146
- # 用 form 把“上传 + 参数 + 提交”打包,避免按钮重跑导致 file_uploader 丢值
147
  with st.form("run_form", clear_on_submit=False):
148
  pdf = st.file_uploader("上传扫描 PDF", type=["pdf"], accept_multiple_files=False, key="pdf_uploader")
149
 
@@ -194,6 +205,12 @@ if "pdf_uploader" in st.session_state and st.session_state["pdf_uploader"]:
194
  f"({st.session_state['pdf_uploader'].size/1024:.1f} KB)")
195
 
196
  if submitted:
 
 
 
 
 
 
197
  out_pdf, sidecar_path, preview = run_ocr(
198
  pdf, langs, rotate_pages, deskew, clean, optimize_level,
199
  force_ocr, skip_text, export_sidecar
@@ -215,4 +232,4 @@ if submitted:
215
  st.components.v1.html(html_content, height=650, scrolling=True)
216
  if jsonl_path and Path(jsonl_path).exists():
217
  with open(jsonl_path, "rb") as f:
218
- jsonl_slot.download_button("下载抽取结果 JSONL", f, file_name="extractions.jsonl")
 
25
  st.error("系统未检测到 ocrmypdf,可检查 Docker/依赖安装。")
26
  return None, None, None
27
 
28
+ # 修复:重置文件指针到开头,然后读取内容
29
+ try:
30
+ pdf_file.seek(0) # 重置文件指针
31
+ pdf_content = pdf_file.read()
32
+ if not pdf_content:
33
+ st.error("PDF 文件内容为空。")
34
+ return None, None, None
35
+ except Exception as e:
36
+ st.error(f"读取 PDF 文件失败:{e}")
37
+ return None, None, None
38
+
39
  # 保存上传文件到临时路径
40
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
41
+ tmp.write(pdf_content) # 使用读取到的内容
42
  in_path = Path(tmp.name)
43
 
44
  work = Path(tempfile.mkdtemp(prefix="ocr_"))
 
102
  # 最小 few-shot 示例(演示用;生产请替换为你的法律 schema)
103
  examples = [
104
  lx.data.ExampleData(
105
+ text="On 12/03/2023, ABC Pty Ltd terminated Ms Wang's employment.",
106
  extractions=[
107
  lx.data.Extraction(
108
  extraction_class="party",
 
154
  with st.sidebar:
155
  st.header("参数")
156
 
157
+ # 用 form 把"上传 + 参数 + 提交"打包,避免按钮重跑导致 file_uploader 丢值
158
  with st.form("run_form", clear_on_submit=False):
159
  pdf = st.file_uploader("上传扫描 PDF", type=["pdf"], accept_multiple_files=False, key="pdf_uploader")
160
 
 
205
  f"({st.session_state['pdf_uploader'].size/1024:.1f} KB)")
206
 
207
  if submitted:
208
+ # 添加调试信息
209
+ if pdf is None:
210
+ st.error("PDF 为 None - 检查文件上传")
211
+ else:
212
+ st.info(f"PDF 文件信息:名称={pdf.name}, 大小={pdf.size} bytes")
213
+
214
  out_pdf, sidecar_path, preview = run_ocr(
215
  pdf, langs, rotate_pages, deskew, clean, optimize_level,
216
  force_ocr, skip_text, export_sidecar
 
232
  st.components.v1.html(html_content, height=650, scrolling=True)
233
  if jsonl_path and Path(jsonl_path).exists():
234
  with open(jsonl_path, "rb") as f:
235
+ jsonl_slot.download_button("下载抽取结果 JSONL", f, file_name="extractions.jsonl")