Spaces:

nanoapple
/

LangmyOCR

Sleeping

App Files Files Community

nanoapple committed 19 days ago

Commit

07542f7

verified ·

1 Parent(s): 2492dc8

Upload 4 files

Browse files

Files changed (4) hide show

README.md +13 -18
app.py +95 -0
apt.txt +5 -0
requirements.txt +4 -3

README.md CHANGED Viewed

@@ -1,20 +1,15 @@
----
-title: LangmyOCR
-emoji: 🚀
-colorFrom: red
-colorTo: red
-sdk: docker
-app_port: 8501
-tags:
-- streamlit
-pinned: false
-short_description: Processes up-to-five-page PDFs by adding searchable OCR text
-license: mit
----
-# Welcome to Streamlit!
-Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).

+# 📄 OCR + LangExtract Web App (Hugging Face Spaces)
+一个极简示例：
+1. 用户上传 ≤ 5 页的 PDF（扫描或数字文档皆可）；
+2. 调用 **OCRmyPDF** 为扫描页添加不可见文字层；
+3. 从全文中提取结构化信息（借助 [LangExtract](https://github.com/google/langextract)）；
+4. 将提取结果在界面中展示，并可下载 JSON。
+> **必备**
+> • 在 **Spaces → Settings → Secrets** 里添加 `LANGEXTRACT_API_KEY`（如果用 Gemini API）。
+> • 资源免费层仅供演示，上传文件 ≤ 5 MB 且 ≤ 5 页以控制内存和执行时间。
+>
+> 如需扩展：
+> • 提示词、示例、模型 ID 都可在 `app.py` 里调整；
+> • 可接入 Postgres / S3 存储历史记录，或增加登录鉴权。

app.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import streamlit as st
+import tempfile, os, subprocess, json, time
+import fitz  # PyMuPDF
+import langextract as lx
+# ------------------- 环境配置 -------------------
+API_KEY = os.getenv("LANGEXTRACT_API_KEY")
+if API_KEY:
+    lx.configure(api_key=API_KEY)
+st.set_page_config(page_title="OCR + LangExtract", page_icon="📄")
+st.title("📄 OCR + LangExtract Demo")
+st.caption("上传 ≤ 5 页 PDF → OCR → 结构化提取")
+# ------------------- 文件上传 -------------------
+uploaded = st.file_uploader("选择 PDF 文件（≤ 5 页）", type=["pdf"])
+if uploaded:
+    with tempfile.TemporaryDirectory() as tmpdir:
+        raw_path = os.path.join(tmpdir, "input.pdf")
+        ocr_path = os.path.join(tmpdir, "ocr.pdf")
+        with open(raw_path, "wb") as f:
+            f.write(uploaded.read())
+        # --------- 页数检测 ---------
+        pages = fitz.open(raw_path).page_count
+        if pages > 5:
+            st.error(f"🚫 当前 {pages} 页，超出限制。请上传 ≤ 5 页的 PDF。")
+            st.stop()
+        # --------- OCR 处理 ---------
+        st.info("⚙️ 正在执行 OCR，请稍候 …")
+        t0 = time.time()
+        try:
+            subprocess.run(
+                [
+                    "ocrmypdf",
+                    raw_path,
+                    ocr_path,
+                    "-l", "eng+chi_sim",     # 英文 + 简体中文
+                    "--skip-text",           # 跳过已有文字页
+                    "--quiet"
+                ],
+                check=True,
+            )
+        except subprocess.CalledProcessError as e:
+            st.error(f"OCR 失败：{e}")
+            st.stop()
+        st.success(f"OCR 完成，用时 {time.time() - t0:.1f}s")
+        # --------- 提取全文 ---------
+        doc = fitz.open(ocr_path)
+        full_text = "\n".join(page.get_text() for page in doc)
+        # --------- LangExtract 结构化 ---------
+        st.info("🔍 LangExtract 正在解析 …")
+        prompt = (
+            "Extract all PERSON names and their roles or titles in the document. "
+            "Use exact spans from the text and include start/end indices."
+        )
+        try:
+            result = lx.extract(
+                text_or_documents=full_text,
+                prompt_description=prompt,
+                model_id="gemini-2.5-flash",     # 可改为 pro / ollama 等
+            )
+        except Exception as e:
+            st.error(f"LangExtract 调用失败：{e}")
+            st.stop()
+        # --------- 显示结果 ---------
+        st.subheader("📊 提取结果")
+        if not result.extractions:
+            st.warning("未找到符合条件的实体。")
+        else:
+            for ext in result.extractions:
+                st.json(
+                    {
+                        "class": ext.extraction_class,
+                        "text": ext.extraction_text,
+                        "start": ext.begin_offset,
+                        "end": ext.end_offset,
+                        **(ext.attributes or {}),
+                    },
+                    expanded=False,
+                )
+            # 提供下载
+            json_bytes = json.dumps([ext.to_dict() for ext in result.extractions], ensure_ascii=False, indent=2).encode()
+            st.download_button(
+                "💾 下载 JSON",
+                data=json_bytes,
+                file_name="extractions.json",
+                mime="application/json",
+            )

apt.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+# OCRmyPDF 运行依赖
+tesseract-ocr
+tesseract-ocr-eng
+tesseract-ocr-chi-sim
+ghostscript

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
-altair
-pandas
-streamlit

+streamlit>=1.35
+ocrmypdf>=15.0
+PyMuPDF>=1.24
+langextract>=0.5.0