Upload 4 files
Browse files
README.md
CHANGED
@@ -1,20 +1,15 @@
|
|
1 |
-
|
2 |
-
title: LangmyOCR
|
3 |
-
emoji: 🚀
|
4 |
-
colorFrom: red
|
5 |
-
colorTo: red
|
6 |
-
sdk: docker
|
7 |
-
app_port: 8501
|
8 |
-
tags:
|
9 |
-
- streamlit
|
10 |
-
pinned: false
|
11 |
-
short_description: Processes up-to-five-page PDFs by adding searchable OCR text
|
12 |
-
license: mit
|
13 |
-
---
|
14 |
|
15 |
-
|
|
|
|
|
|
|
|
|
16 |
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
|
|
|
|
|
|
|
1 |
+
# 📄 OCR + LangExtract Web App (Hugging Face Spaces)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
+
一个极简示例:
|
4 |
+
1. 用户上传 ≤ 5 页的 PDF(扫描或数字文档皆可);
|
5 |
+
2. 调用 **OCRmyPDF** 为扫描页添加不可见文字层;
|
6 |
+
3. 从全文中提取结构化信息(借助 [LangExtract](https://github.com/google/langextract));
|
7 |
+
4. 将提取结果在界面中展示,并可下载 JSON。
|
8 |
|
9 |
+
> **必备**
|
10 |
+
> • 在 **Spaces → Settings → Secrets** 里添加 `LANGEXTRACT_API_KEY`(如果用 Gemini API)。
|
11 |
+
> • 免费层资源仅供演示,上传文件 ≤ 5 MB 且 ≤ 5 页以控制内存和执行时间。
|
12 |
+
>
|
13 |
+
> 如需扩展:
|
14 |
+
> • 提示词、示例、模型 ID 都可在 `app.py` 里调整;
|
15 |
+
> • 可接入 Postgres / S3 存储历史记录,或增加登录鉴权。
|
app.py
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import tempfile, os, subprocess, json, time
|
3 |
+
import fitz # PyMuPDF
|
4 |
+
import langextract as lx
|
5 |
+
|
6 |
+
# ------------------- Environment configuration -------------------
# Streamlit demo: upload a short PDF, add an OCR text layer with OCRmyPDF,
# pull the text out with PyMuPDF, then ask LangExtract for people and roles.
#
# The API key is passed straight to lx.extract() further down; LangExtract
# does not expose a module-level configure() helper, so calling one would
# fail at import time whenever the secret is set.
API_KEY = os.getenv("LANGEXTRACT_API_KEY")

MAX_PAGES = 5
MAX_BYTES = 5 * 1024 * 1024  # the README promises a 5 MB upload cap

st.set_page_config(page_title="OCR + LangExtract", page_icon="📄")
st.title("📄 OCR + LangExtract Demo")
st.caption("上传 ≤ 5 页 PDF → OCR → 结构化提取")

# ------------------- File upload -------------------
uploaded = st.file_uploader("选择 PDF 文件(≤ 5 页)", type=["pdf"])
if uploaded:
    pdf_bytes = uploaded.getvalue()
    if len(pdf_bytes) > MAX_BYTES:
        st.error("🚫 文件超过 5 MB,请上传更小的 PDF。")
        st.stop()

    with tempfile.TemporaryDirectory() as tmpdir:
        raw_path = os.path.join(tmpdir, "input.pdf")
        ocr_path = os.path.join(tmpdir, "ocr.pdf")
        with open(raw_path, "wb") as f:
            f.write(pdf_bytes)

        # --------- Page-count check ---------
        # Open inside a context manager so the handle is released before
        # the temporary directory is removed.
        pages = None
        open_error = None
        try:
            with fitz.open(raw_path) as src:
                pages = src.page_count
        except Exception as e:  # PyMuPDF raises its own errors for broken files
            open_error = e
        if open_error is not None:
            st.error(f"无法读取 PDF:{open_error}")
            st.stop()
        if pages > MAX_PAGES:
            st.error(f"🚫 当前 {pages} 页,超出限制。请上传 ≤ 5 页的 PDF。")
            st.stop()

        # --------- OCR ---------
        st.info("⚙️ 正在执行 OCR,请稍候 …")
        t0 = time.time()
        try:
            subprocess.run(
                [
                    "ocrmypdf",
                    raw_path,
                    ocr_path,
                    "-l", "eng+chi_sim",  # English + Simplified Chinese
                    "--skip-text",        # leave pages that already have text alone
                    "--quiet",
                ],
                check=True,
            )
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            # FileNotFoundError: the ocrmypdf executable is not installed.
            st.error(f"OCR 失败:{e}")
            st.stop()
        st.success(f"OCR 完成,用时 {time.time() - t0:.1f}s")

        # --------- Full-text extraction ---------
        with fitz.open(ocr_path) as doc:
            full_text = "\n".join(page.get_text() for page in doc)

    # Everything below works on in-memory text only, so the temporary
    # directory can already be gone.
    if not full_text.strip():
        st.warning("未能从 PDF 中提取到任何文字。")
        st.stop()

    # --------- LangExtract structured extraction ---------
    st.info("🔍 LangExtract 正在解析 …")
    prompt = (
        "Extract all PERSON names and their roles or titles in the document. "
        "Use exact spans from the text and include start/end indices."
    )
    # LangExtract requires at least one few-shot example to shape its output.
    examples = [
        lx.data.ExampleData(
            text="Alice Wang, Chief Financial Officer, signed the report.",
            extractions=[
                lx.data.Extraction(
                    extraction_class="person",
                    extraction_text="Alice Wang",
                    attributes={"role": "Chief Financial Officer"},
                ),
            ],
        ),
    ]

    try:
        result = lx.extract(
            text_or_documents=full_text,
            prompt_description=prompt,
            examples=examples,
            model_id="gemini-2.5-flash",  # can be switched to pro / ollama etc.
            api_key=API_KEY,
        )
    except Exception as e:
        st.error(f"LangExtract 调用失败:{e}")
        st.stop()

    # --------- Show results ---------
    st.subheader("📊 提取结果")
    if not result.extractions:
        st.warning("未找到符合条件的实体。")
    else:
        # Build plain dicts once; they feed both the on-page view and the
        # download. Character offsets live on char_interval, which may be
        # missing when a span could not be aligned to the source text.
        records = []
        for ext in result.extractions:
            span = ext.char_interval
            records.append(
                {
                    "class": ext.extraction_class,
                    "text": ext.extraction_text,
                    "start": span.start_pos if span else None,
                    "end": span.end_pos if span else None,
                    **(ext.attributes or {}),
                }
            )

        for record in records:
            st.json(record, expanded=False)

        # Offer the same records as a download.
        json_bytes = json.dumps(records, ensure_ascii=False, indent=2).encode("utf-8")
        st.download_button(
            "💾 下载 JSON",
            data=json_bytes,
            file_name="extractions.json",
            mime="application/json",
        )
|
apt.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# OCRmyPDF 运行依赖
|
2 |
+
tesseract-ocr
|
3 |
+
tesseract-ocr-eng
|
4 |
+
tesseract-ocr-chi-sim
|
5 |
+
ghostscript
|
requirements.txt
CHANGED
@@ -1,3 +1,4 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
|
|
|
1 |
+
streamlit>=1.35
|
2 |
+
ocrmypdf>=15.0
|
3 |
+
PyMuPDF>=1.24
|
4 |
+
langextract>=0.5.0
|