nanoapple committed on
Commit
d42a887
·
verified ·
1 Parent(s): b3e29c3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -61
app.py CHANGED
@@ -3,75 +3,84 @@ import tempfile, os, subprocess, json, time
3
  import fitz # PyMuPDF
4
  import langextract as lx
5
 
6
- # ------------------- 页面配置 -------------------
7
  st.set_page_config(page_title="OCR + LangExtract", page_icon="📄")
8
  st.title("📄 OCR + LangExtract Demo")
9
  st.caption("上传 ≤ 5 页 PDF → OCR → 结构化提取")
10
 
11
- # ------------------- 文件上传 -------------------
12
- uploaded = st.file_uploader("选择 PDF 文件(≤ 5 页)", type=["pdf"])
13
  if uploaded:
14
- with tempfile.TemporaryDirectory() as tmpdir:
15
- input_path = os.path.join(tmpdir, "input.pdf")
16
- output_path = os.path.join(tmpdir, "ocr.pdf")
17
- with open(input_path, "wb") as f:
18
- f.write(uploaded.read())
 
 
19
 
20
- # --------- 检查页数 ---------
21
- doc = fitz.open(input_path)
22
- if doc.page_count > 5:
23
- st.error("🚫 当前 PDF 超过 5 页。请上传不超过 5 页的文件。")
24
- st.stop()
 
 
 
 
25
 
26
- # --------- OCR 处理 ---------
27
- st.info("🔍 正在进行 OCR,请稍候 …")
28
- try:
29
- subprocess.run([
30
- "ocrmypdf",
31
- input_path,
32
- output_path,
33
- "-l", "eng+chi_sim",
34
- "--skip-text",
35
- "--quiet"
36
- ], check=True)
37
- except subprocess.CalledProcessError as e:
38
- st.error(f"OCR 失败:{e}")
39
- st.stop()
40
 
41
- # --------- 提取文本 ---------
42
- st.success("✅ OCR 完成,正在提取文本并调用 LangExtract …")
43
- ocr_doc = fitz.open(output_path)
44
- full_text = "\n".join([page.get_text() for page in ocr_doc])
 
 
 
45
 
46
- # --------- LangExtract 调用 ---------
47
- try:
48
- result = lx.extract(
49
- text_or_documents=full_text,
50
- prompt_description="Extract named people and organisations from the document.",
51
- model_id="gemini-1.5-flash"
52
- )
53
- except Exception as e:
54
- st.error(f"结构化提取失败:{e}")
55
- st.stop()
 
56
 
57
- # --------- 显示结果 ---------
58
- st.subheader("📊 提取结果")
59
- if not result.extractions:
60
- st.warning("未提取到任何实体。")
61
- else:
62
- for ext in result.extractions:
63
- st.json({
64
- "class": ext.extraction_class,
65
- "text": ext.extraction_text,
66
- "start": ext.begin_offset,
67
- "end": ext.end_offset,
68
- **(ext.attributes or {})
69
- }, expanded=False)
70
 
71
- # 下载按钮
72
- st.download_button(
73
- "💾 下载提取结果 (JSON)",
74
- data=json.dumps([e.to_dict() for e in result.extractions], ensure_ascii=False, indent=2),
75
- file_name="extractions.json",
76
- mime="application/json"
77
- )
 
3
  import fitz # PyMuPDF
4
  import langextract as lx
5
 
 
6
  st.set_page_config(page_title="OCR + LangExtract", page_icon="📄")
7
  st.title("📄 OCR + LangExtract Demo")
8
  st.caption("上传 ≤ 5 页 PDF → OCR → 结构化提取")
9
 
10
# Upload -> OCR -> structured-extraction flow of the Streamlit app.
#
# Steps: save the uploaded PDF to a temporary directory, enforce the page
# limit, run the `ocrmypdf` command-line tool on it, read the text back with
# PyMuPDF, send that text to LangExtract, then render the extractions and
# offer them as a JSON download. Every failure is reported with `st.error`
# and ends the current script run via `st.stop()`.

# Single source of truth for the page limit so the uploader label, the check
# and the error message cannot drift apart.
MAX_PAGES = 10

uploaded = st.file_uploader(f"选择 PDF 文件(≤ {MAX_PAGES} 页)", type=["pdf"])

if uploaded:
    st.success("✅ 文件已上传,请稍候 …")
    with st.spinner("📄 正在进行 OCR 和结构化提取 …"):
        with tempfile.TemporaryDirectory() as tmpdir:
            input_path = os.path.join(tmpdir, "input.pdf")
            output_path = os.path.join(tmpdir, "ocr.pdf")
            with open(input_path, "wb") as f:
                # getvalue() returns the whole buffer regardless of the
                # stream's current read position, unlike read().
                f.write(uploaded.getvalue())

            # Check the page count. The document is closed before the
            # temporary directory is cleaned up.
            try:
                with fitz.open(input_path) as doc:
                    page_count = doc.page_count
            except Exception as e:
                st.error(f"无法打开 PDF 文件:{e}")
                st.stop()
            if page_count > MAX_PAGES:
                st.error(
                    f"🚫 当前 PDF 超过 {MAX_PAGES} 页。"
                    f"请上传不超过 {MAX_PAGES} 页的文件。"
                )
                st.stop()

            # Run OCR. FileNotFoundError covers a missing `ocrmypdf` binary,
            # which would otherwise surface as a raw traceback.
            try:
                subprocess.run(
                    [
                        "ocrmypdf",
                        input_path,
                        output_path,
                        "-l", "eng+chi_sim",
                        "--skip-text",
                        "--quiet",
                    ],
                    check=True,
                )
            except (subprocess.CalledProcessError, FileNotFoundError) as e:
                st.error(f"OCR 失败:{e}")
                st.stop()
            st.success("✅ OCR 完成")

            # Read the text of the OCR output, one page per line group.
            try:
                with fitz.open(output_path) as ocr_doc:
                    full_text = "\n".join(page.get_text() for page in ocr_doc)
            except Exception as e:
                st.error(f"提取 OCR 文本失败:{e}")
                st.stop()

        # Nothing to extract from: skip the model call entirely.
        if not full_text.strip():
            st.warning("未提取到任何实体。")
            st.stop()

        # Call LangExtract.
        # NOTE(review): no `examples` argument is passed here; check whether
        # the installed langextract version requires it - TODO confirm.
        try:
            result = lx.extract(
                text_or_documents=full_text,
                prompt_description="Extract named people and organisations from the document.",
                model_id="gemini-1.5-flash",
            )
        except Exception as e:
            st.error(f"结构化提取失败:{e}")
            st.stop()
        st.success("✅ 结构化提取完成")

    # Show the results.
    st.subheader("📊 提取结果")
    extractions = result.extractions
    if not extractions:
        st.warning("未提取到任何实体。")
    else:
        # NOTE(review): `begin_offset`, `end_offset` and `to_dict()` are used
        # as in the original code; verify that the installed langextract
        # extraction objects expose them - TODO confirm.
        for ext in extractions:
            st.json(
                {
                    "class": ext.extraction_class,
                    "text": ext.extraction_text,
                    "start": ext.begin_offset,
                    "end": ext.end_offset,
                    **(ext.attributes or {}),
                },
                expanded=False,
            )

        # Download button, offered only when there is something to download.
        st.download_button(
            "💾 下载提取结果 (JSON)",
            data=json.dumps(
                [e.to_dict() for e in extractions],
                ensure_ascii=False,
                indent=2,
            ),
            file_name="extractions.json",
            mime="application/json",
        )