Spaces:

nanoapple
/

LangmyOCR

Sleeping

App Files Files Community

nanoapple committed 20 days ago

Commit

65e2c9b

verified ·

1 Parent(s): 3546c19

Upload app.py

Browse files

Files changed (1) hide show

app.py +65 -0

app.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import streamlit as st
+import tempfile
+import os
+import subprocess
+import json
+import time
+import fitz  # PyMuPDF
+import langextract as lx
+# ------------------ 环境配置 ------------------
+API_KEY = os.getenv("LANGEXTRACT_API_KEY")
+if API_KEY:
+    try:
+        lx.configure(api_key=API_KEY)
+    except Exception as e:
+        st.warning(f"⚠️ LangExtract 配置失败：{e}")
+# ------------------ 页面设置 ------------------
+st.set_page_config(page_title="OCR + LangExtract", page_icon="📄")
+st.title("📄 OCR + LangExtract Demo")
+st.caption("上传 ≤ 5 页 PDF → OCR → 结构化提取")
+# ------------------ 文件上传 ------------------
+uploaded_file = st.file_uploader("选择 PDF 文件（≤ 5 页）", type=["pdf"])
+if uploaded_file is not None:
+    try:
+        with st.spinner("🚀 正在处理 PDF，请稍候..."):
+            # 保存到临时文件
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+                tmp_file.write(uploaded_file.read())
+                tmp_file_path = tmp_file.name
+            # OCR 输出文件路径
+            ocr_output_path = tmp_file_path.replace(".pdf", "_ocr.pdf")
+            # 执行 OCRmyPDF（假设已安装）
+            subprocess.run(["ocrmypdf", tmp_file_path, ocr_output_path, "--force-ocr", "--deskew"],
+                           check=True, capture_output=True)
+            # 提取 OCR 后的文本
+            with fitz.open(ocr_output_path) as doc:
+                text = ""
+                for page in doc:
+                    text += page.get_text()
+            # 结构化提取
+            result = lx.extract(text=text)
+        # 显示结果
+        st.success("✅ 提取完成，以下为结构化内容：")
+        with st.expander("📦 展开查看 JSON"):
+            st.json(result)
+        # 提供下载
+        st.download_button("📥 下载 JSON 文件", data=json.dumps(result, indent=2),
+                           file_name="result.json", mime="application/json")
+    except subprocess.CalledProcessError as e:
+        st.error("❌ OCR 处理失败。请确认文件为扫描件 PDF，且 ocrmypdf 已正确安装。")
+        st.text(e.stderr.decode())
+    except Exception as e:
+        st.error(f"❌ 发生错误：{e}")