Spaces:

nanoapple
/

LangmyOCR

Sleeping

App Files Community

nanoapple committed 22 days ago

Commit

b3e29c3

verified ·

1 Parent(s): e4d3eeb

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -56

app.py CHANGED Viewed

@@ -3,11 +3,7 @@ import tempfile, os, subprocess, json, time
 import fitz  # PyMuPDF
 import langextract as lx
-# ------------------- 环境配置 -------------------
-API_KEY = os.getenv("LANGEXTRACT_API_KEY")
-if API_KEY:
-    lx.configure(api_key=API_KEY)
 st.set_page_config(page_title="OCR + LangExtract", page_icon="📄")
 st.title("📄 OCR + LangExtract Demo")
 st.caption("上传 ≤ 5 页 PDF → OCR → 结构化提取")
@@ -16,80 +12,66 @@ st.caption("上传 ≤ 5 页 PDF → OCR → 结构化提取")
 uploaded = st.file_uploader("选择 PDF 文件（≤ 5 页）", type=["pdf"])
 if uploaded:
     with tempfile.TemporaryDirectory() as tmpdir:
-        raw_path = os.path.join(tmpdir, "input.pdf")
-        ocr_path = os.path.join(tmpdir, "ocr.pdf")
-        with open(raw_path, "wb") as f:
             f.write(uploaded.read())
-        # --------- 页数检测 ---------
-        pages = fitz.open(raw_path).page_count
-        if pages > 5:
-            st.error(f"🚫 当前 {pages} 页，超出限制。请上传 ≤ 5 页的 PDF。")
             st.stop()
         # --------- OCR 处理 ---------
-        st.info("⚙️ 正在执行 OCR，请稍候 …")
-        t0 = time.time()
         try:
-            subprocess.run(
-                [
-                    "ocrmypdf",
-                    raw_path,
-                    ocr_path,
-                    "-l", "eng+chi_sim",     # 英文 + 简体中文
-                    "--skip-text",           # 跳过已有文字页
-                    "--quiet"
-                ],
-                check=True,
-            )
         except subprocess.CalledProcessError as e:
             st.error(f"OCR 失败：{e}")
             st.stop()
-        st.success(f"OCR 完成，用时 {time.time() - t0:.1f}s")
-        # --------- 提取全文 ---------
-        doc = fitz.open(ocr_path)
-        full_text = "\n".join(page.get_text() for page in doc)
-        # --------- LangExtract 结构化 ---------
-        st.info("🔍 LangExtract 正在解析 …")
-        prompt = (
-            "Extract all PERSON names and their roles or titles in the document. "
-            "Use exact spans from the text and include start/end indices."
-        )
         try:
             result = lx.extract(
                 text_or_documents=full_text,
-                prompt_description=prompt,
-                model_id="gemini-2.5-flash",     # 可改为 pro / ollama 等
             )
         except Exception as e:
-            st.error(f"LangExtract 调用失败：{e}")
             st.stop()
         # --------- 显示结果 ---------
         st.subheader("📊 提取结果")
         if not result.extractions:
-            st.warning("未找到符合条件的实体。")
         else:
             for ext in result.extractions:
-                st.json(
-                    {
-                        "class": ext.extraction_class,
-                        "text": ext.extraction_text,
-                        "start": ext.begin_offset,
-                        "end": ext.end_offset,
-                        **(ext.attributes or {}),
-                    },
-                    expanded=False,
-                )
-            # 提供下载
-            json_bytes = json.dumps([ext.to_dict() for ext in result.extractions], ensure_ascii=False, indent=2).encode()
             st.download_button(
-                "💾 下载 JSON",
-                data=json_bytes,
                 file_name="extractions.json",
-                mime="application/json",
-            )

 import fitz  # PyMuPDF
 import langextract as lx
+# ------------------- 页面配置 -------------------
 st.set_page_config(page_title="OCR + LangExtract", page_icon="📄")
 st.title("📄 OCR + LangExtract Demo")
 st.caption("上传 ≤ 5 页 PDF → OCR → 结构化提取")
 uploaded = st.file_uploader("选择 PDF 文件（≤ 5 页）", type=["pdf"])
 if uploaded:
     with tempfile.TemporaryDirectory() as tmpdir:
+        input_path = os.path.join(tmpdir, "input.pdf")
+        output_path = os.path.join(tmpdir, "ocr.pdf")
+        with open(input_path, "wb") as f:
             f.write(uploaded.read())
+        # --------- 检查页数 ---------
+        doc = fitz.open(input_path)
+        if doc.page_count > 5:
+            st.error("🚫 当前 PDF 超过 5 页。请上传不超过 5 页的文件。")
             st.stop()
         # --------- OCR 处理 ---------
+        st.info("🔍 正在进行 OCR，请稍候 …")
         try:
+            subprocess.run([
+                "ocrmypdf",
+                input_path,
+                output_path,
+                "-l", "eng+chi_sim",
+                "--skip-text",
+                "--quiet"
+            ], check=True)
         except subprocess.CalledProcessError as e:
             st.error(f"OCR 失败：{e}")
             st.stop()
+        # --------- 提取文本 ---------
+        st.success("✅ OCR 完成，正在提取文本并调用 LangExtract …")
+        ocr_doc = fitz.open(output_path)
+        full_text = "\n".join([page.get_text() for page in ocr_doc])
+        # --------- LangExtract 调用 ---------
         try:
             result = lx.extract(
                 text_or_documents=full_text,
+                prompt_description="Extract named people and organisations from the document.",
+                model_id="gemini-1.5-flash"
             )
         except Exception as e:
+            st.error(f"结构化提取失败：{e}")
             st.stop()
         # --------- 显示结果 ---------
         st.subheader("📊 提取结果")
         if not result.extractions:
+            st.warning("未提取到任何实体。")
         else:
             for ext in result.extractions:
+                st.json({
+                    "class": ext.extraction_class,
+                    "text": ext.extraction_text,
+                    "start": ext.begin_offset,
+                    "end": ext.end_offset,
+                    **(ext.attributes or {})
+                }, expanded=False)
+            # 下载按钮
             st.download_button(
+                "💾 下载提取结果 (JSON)",
+                data=json.dumps([e.to_dict() for e in result.extractions], ensure_ascii=False, indent=2),
                 file_name="extractions.json",
+                mime="application/json"
+            )