nanoapple committed on
Commit
65e2c9b
·
verified ·
1 Parent(s): 3546c19

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -0
app.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit demo: upload a short PDF, OCR it with OCRmyPDF, then run LangExtract.

Flow: the uploaded PDF is written to a private temporary directory, OCRmyPDF
produces a text-layer copy, PyMuPDF reads the text back, and the text is passed
to ``lx.extract``. The result is shown as JSON and offered as a download.
"""

import streamlit as st
import tempfile
import os
import subprocess
import json
import time
import fitz  # PyMuPDF
import langextract as lx

# Upper bound on pages accepted per upload; the UI text advertises the same limit.
MAX_PAGES = 5

# ------------------ Page setup ------------------
# set_page_config is issued before any other Streamlit command so that the
# configuration warning below can never precede it.
st.set_page_config(page_title="OCR + LangExtract", page_icon="📄")

# ------------------ Environment configuration ------------------
API_KEY = os.getenv("LANGEXTRACT_API_KEY")
if API_KEY:
    try:
        # NOTE(review): confirm that the installed langextract exposes
        # `configure`; if it does not, the AttributeError is reported below.
        lx.configure(api_key=API_KEY)
    except Exception as e:
        st.warning(f"⚠️ LangExtract 配置失败:{e}")

st.title("📄 OCR + LangExtract Demo")
st.caption("上传 ≤ 5 页 PDF → OCR → 结构化提取")


def _ocr_pdf_to_text(pdf_bytes):
    """Run OCRmyPDF on the given PDF bytes and return the recognised text.

    All intermediate files live in a temporary directory that is removed when
    the function returns, whether it succeeds or raises.

    Args:
        pdf_bytes: Raw content of the uploaded PDF.

    Returns:
        The concatenated text of every page of the OCR'd PDF.

    Raises:
        ValueError: If the PDF has more than ``MAX_PAGES`` pages.
        subprocess.CalledProcessError: If ``ocrmypdf`` exits with a non-zero status.
    """
    with tempfile.TemporaryDirectory() as work_dir:
        # Fixed file names inside a private directory avoid deriving the output
        # path by string replacement on the input path.
        input_path = os.path.join(work_dir, "input.pdf")
        ocr_output_path = os.path.join(work_dir, "input_ocr.pdf")

        with open(input_path, "wb") as pdf_file:
            pdf_file.write(pdf_bytes)

        # Enforce the advertised page limit before paying for OCR.
        with fitz.open(input_path) as source_doc:
            page_count = source_doc.page_count
        if page_count > MAX_PAGES:
            raise ValueError(f"PDF 共 {page_count} 页,超过 {MAX_PAGES} 页的上限。")

        # Run OCRmyPDF (assumed to be installed and on PATH).
        subprocess.run(
            ["ocrmypdf", input_path, ocr_output_path, "--force-ocr", "--deskew"],
            check=True,
            capture_output=True,
        )

        # Extract the text layer produced by OCR.
        with fitz.open(ocr_output_path) as ocr_doc:
            return "".join(page.get_text() for page in ocr_doc)


# ------------------ File upload ------------------
uploaded_file = st.file_uploader("选择 PDF 文件(≤ 5 页)", type=["pdf"])

if uploaded_file is not None:
    try:
        with st.spinner("🚀 正在处理 PDF,请稍候..."):
            # getvalue() returns the whole buffer regardless of the current
            # stream position, unlike read().
            text = _ocr_pdf_to_text(uploaded_file.getvalue())

            # Structured extraction.
            # NOTE(review): verify this call against the installed langextract
            # signature (argument name, prompt/examples requirements).
            result = lx.extract(text=text)

        # Show the result.
        st.success("✅ 提取完成,以下为结构化内容:")
        with st.expander("📦 展开查看 JSON"):
            st.json(result)

        # Offer the result as a download. ensure_ascii=False keeps non-ASCII
        # text readable instead of emitting \uXXXX escapes.
        # NOTE(review): this assumes `result` is JSON-serialisable; if it is
        # not, the TypeError is reported by the generic handler below.
        st.download_button(
            "📥 下载 JSON 文件",
            data=json.dumps(result, indent=2, ensure_ascii=False),
            file_name="result.json",
            mime="application/json",
        )

    except subprocess.CalledProcessError as e:
        st.error("❌ OCR 处理失败。请确认文件为扫描件 PDF,且 ocrmypdf 已正确安装。")
        # Decode leniently so undecodable tool output cannot mask the OCR error.
        st.text((e.stderr or b"").decode(errors="replace"))

    except Exception as e:
        st.error(f"❌ 发生错误:{e}")