nanoapple committed on
Commit
07542f7
·
verified ·
1 Parent(s): 2492dc8

Upload 4 files

Browse files
Files changed (4) hide show
  1. README.md +13 -18
  2. app.py +95 -0
  3. apt.txt +5 -0
  4. requirements.txt +4 -3
README.md CHANGED
@@ -1,20 +1,15 @@
1
- ---
2
- title: LangmyOCR
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: red
6
- sdk: docker
7
- app_port: 8501
8
- tags:
9
- - streamlit
10
- pinned: false
11
- short_description: Processes up-to-five-page PDFs by adding searchable OCR text
12
- license: mit
13
- ---
14
 
15
- # Welcome to Streamlit!
 
 
 
 
16
 
17
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
18
-
19
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
20
- forums](https://discuss.streamlit.io).
 
 
 
 
1
+ # 📄 OCR + LangExtract Web App (Hugging Face Spaces)
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
+ 一个极简示例:
4
+ 1. 用户上传 ≤ 5 页的 PDF(扫描或数字文档皆可);
5
+ 2. 调用 **OCRmyPDF** 为扫描页添加不可见文字层;
6
+ 3. 从全文中提取结构化信息(借助 [LangExtract](https://github.com/google/langextract));
7
+ 4. 将提取结果在界面中展示,并可下载 JSON。
8
 
9
+ > **必备**
10
+ > • 在 **Spaces → Settings → Secrets** 里添加 `LANGEXTRACT_API_KEY`(如果用 Gemini API)。
11
+ > 资源免费层仅供演示,上传文件限制为 ≤ 5 MB、≤ 5 页,以控制内存和执行时间。
12
+ >
13
+ > 如需扩展:
14
+ > • 提示词、示例、模型 ID 都可在 `app.py` 里调整;
15
+ > • 可接入 Postgres / S3 存储历史记录,或增加登录鉴权。
app.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit demo: OCR an uploaded PDF (at most 5 pages) and extract people/roles.

Pipeline: upload -> page-count check -> ``ocrmypdf`` subprocess -> text
extraction with PyMuPDF -> structured extraction with LangExtract -> on-screen
JSON plus a downloadable JSON file.
"""
import json
import os
import subprocess
import tempfile
import time

import fitz  # PyMuPDF
import langextract as lx
import streamlit as st

# ------------------- Configuration -------------------
# The key is handed to lx.extract() through its ``api_key`` argument; the
# library does not expose a module-level ``configure`` function.
API_KEY = os.getenv("LANGEXTRACT_API_KEY")

# Upper bound on pages accepted per upload (keeps OCR time and memory small).
MAX_PAGES = 5

PROMPT = (
    "Extract all PERSON names and their roles or titles in the document. "
    "Use exact spans from the text and include start/end indices."
)

# lx.extract() requires at least one few-shot example to pin the output schema.
EXAMPLES = [
    lx.data.ExampleData(
        text="Dr. Alice Chen, Chief Medical Officer, signed the report.",
        extractions=[
            lx.data.Extraction(
                extraction_class="PERSON",
                extraction_text="Alice Chen",
                attributes={"role": "Chief Medical Officer"},
            ),
        ],
    ),
]


def _extraction_record(ext):
    """Return a JSON-serializable dict describing one LangExtract extraction.

    Character offsets live on ``ext.char_interval`` and are absent when the
    span could not be aligned with the source text, hence the ``getattr``
    fallbacks. Attributes are kept under their own key so they cannot
    overwrite the core fields.
    """
    interval = getattr(ext, "char_interval", None)
    return {
        "class": ext.extraction_class,
        "text": ext.extraction_text,
        "start": getattr(interval, "start_pos", None),
        "end": getattr(interval, "end_pos", None),
        "attributes": ext.attributes or {},
    }


st.set_page_config(page_title="OCR + LangExtract", page_icon="📄")
st.title("📄 OCR + LangExtract Demo")
st.caption("上传 ≤ 5 页 PDF → OCR → 结构化提取")

# ------------------- File upload -------------------
uploaded = st.file_uploader("选择 PDF 文件(≤ 5 页)", type=["pdf"])
if uploaded:
    with tempfile.TemporaryDirectory() as tmpdir:
        raw_path = os.path.join(tmpdir, "input.pdf")
        ocr_path = os.path.join(tmpdir, "ocr.pdf")
        with open(raw_path, "wb") as f:
            f.write(uploaded.read())

        # --------- Page-count check ---------
        # Open via a context manager so the handle is released before the
        # temporary directory is removed.
        try:
            with fitz.open(raw_path) as src:
                pages = src.page_count
        except Exception as e:  # UI boundary: report unreadable/corrupt PDFs
            st.error(f"无法读取 PDF:{e}")
            st.stop()
        if pages > MAX_PAGES:
            st.error(f"🚫 当前 {pages} 页,超出限制。请上传 ≤ 5 页的 PDF。")
            st.stop()

        # --------- OCR ---------
        st.info("⚙️ 正在执行 OCR,请稍候 …")
        t0 = time.time()
        try:
            subprocess.run(
                [
                    "ocrmypdf",
                    raw_path,
                    ocr_path,
                    "-l", "eng+chi_sim",  # English + Simplified Chinese
                    "--skip-text",        # leave pages that already have text
                    "--quiet",
                ],
                check=True,
            )
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            # FileNotFoundError: the ocrmypdf executable is not installed.
            st.error(f"OCR 失败:{e}")
            st.stop()
        st.success(f"OCR 完成,用时 {time.time() - t0:.1f}s")

        # --------- Full-text extraction ---------
        with fitz.open(ocr_path) as doc:
            full_text = "\n".join(page.get_text() for page in doc)

    # The temporary files are no longer needed past this point.
    if not full_text.strip():
        # Nothing to send to the model; avoid a pointless API call.
        st.warning("未找到符合条件的实体。")
        st.stop()

    # --------- LangExtract structured extraction ---------
    st.info("🔍 LangExtract 正在解析 …")
    try:
        result = lx.extract(
            text_or_documents=full_text,
            prompt_description=PROMPT,
            examples=EXAMPLES,
            model_id="gemini-2.5-flash",  # can be swapped for pro / ollama etc.
            api_key=API_KEY,
        )
    except Exception as e:  # UI boundary: surface any provider/library error
        st.error(f"LangExtract 调用失败:{e}")
        st.stop()

    # --------- Show results ---------
    st.subheader("📊 提取结果")
    records = [_extraction_record(ext) for ext in (result.extractions or [])]
    if not records:
        st.warning("未找到符合条件的实体。")
    else:
        for record in records:
            st.json(record, expanded=False)

    # Offer the same records as a downloadable JSON file.
    json_bytes = json.dumps(records, ensure_ascii=False, indent=2).encode()
    st.download_button(
        "💾 下载 JSON",
        data=json_bytes,
        file_name="extractions.json",
        mime="application/json",
    )
apt.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # OCRmyPDF 运行依赖
2
+ tesseract-ocr
3
+ tesseract-ocr-eng
4
+ tesseract-ocr-chi-sim
5
+ ghostscript
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
- altair
2
- pandas
3
- streamlit
 
 
1
+ streamlit>=1.35
2
+ ocrmypdf>=15.0
3
+ PyMuPDF>=1.24
4
+ langextract>=0.5.0