nanoapple committed on
Commit
3546c19
·
verified ·
1 Parent(s): d42a887

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -86
app.py DELETED
@@ -1,86 +0,0 @@
"""Streamlit demo: OCR an uploaded PDF and extract named entities from it.

Pipeline: upload -> page-count check -> ``ocrmypdf`` -> text extraction with
PyMuPDF -> entity extraction with LangExtract -> on-screen results and a JSON
download.
"""
import json
import os
import subprocess
import tempfile
import time

import fitz  # PyMuPDF
import langextract as lx
import streamlit as st

# Single source of truth for the page limit. The UI texts and the check
# previously disagreed (5 vs. 10); the enforced value was 10, so keep that.
MAX_PAGES = 10

EXTRACTION_PROMPT = "Extract named people and organisations from the document."

# LangExtract requires at least one worked example to guide the model; calling
# lx.extract() without `examples` is rejected before any model call is made.
EXTRACTION_EXAMPLES = [
    lx.data.ExampleData(
        text="Alice Johnson joined Acme Corporation in 2020.",
        extractions=[
            lx.data.Extraction(
                extraction_class="person",
                extraction_text="Alice Johnson",
            ),
            lx.data.Extraction(
                extraction_class="organisation",
                extraction_text="Acme Corporation",
            ),
        ],
    )
]


def _extraction_record(ext):
    """Return a JSON-serialisable dict describing one LangExtract extraction.

    Character offsets are read from ``ext.char_interval`` and are ``None`` when
    the extraction could not be aligned to the source text. Keys coming from
    ``ext.attributes`` are merged last, so they override the fixed keys on a
    name clash (same precedence as the original inline dict).
    """
    interval = getattr(ext, "char_interval", None)
    return {
        "class": ext.extraction_class,
        "text": ext.extraction_text,
        "start": getattr(interval, "start_pos", None),
        "end": getattr(interval, "end_pos", None),
        **(ext.attributes or {}),
    }


st.set_page_config(page_title="OCR + LangExtract", page_icon="📄")
st.title("📄 OCR + LangExtract Demo")
st.caption(f"上传 ≤ {MAX_PAGES} 页 PDF → OCR → 结构化提取")

uploaded = st.file_uploader(f"选择 PDF 文件(≤ {MAX_PAGES} 页)", type=["pdf"])

if uploaded:
    st.success("✅ 文件已上传,请稍候 …")
    with st.spinner("📄 正在进行 OCR 和结构化提取 …"), \
            tempfile.TemporaryDirectory() as tmpdir:
        input_path = os.path.join(tmpdir, "input.pdf")
        output_path = os.path.join(tmpdir, "ocr.pdf")
        with open(input_path, "wb") as f:
            # getvalue() returns the full payload regardless of the current
            # read position of the uploaded buffer.
            f.write(uploaded.getvalue())

        # Check the page count. The document is closed before the temporary
        # directory is removed so no handle is left on the file.
        try:
            with fitz.open(input_path) as doc:
                page_count = doc.page_count
        except Exception as e:  # UI boundary: report any open/parse failure
            st.error(f"无法打开 PDF 文件:{e}")
            st.stop()
        if page_count > MAX_PAGES:
            st.error(
                f"🚫 当前 PDF 超过 {MAX_PAGES} 页。"
                f"请上传不超过 {MAX_PAGES} 页的文件。"
            )
            st.stop()

        # Run OCR. FileNotFoundError covers a missing `ocrmypdf` executable,
        # CalledProcessError covers a non-zero exit status.
        try:
            subprocess.run(
                [
                    "ocrmypdf",
                    input_path,
                    output_path,
                    "-l", "eng+chi_sim",
                    "--skip-text",
                    "--quiet",
                ],
                check=True,
            )
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            st.error(f"OCR 失败:{e}")
            st.stop()
        st.success("✅ OCR 完成")

        # Extract the text layer from the OCR output.
        try:
            with fitz.open(output_path) as ocr_doc:
                full_text = "\n".join(page.get_text() for page in ocr_doc)
        except Exception as e:  # UI boundary: report any read failure
            st.error(f"提取 OCR 文本失败:{e}")
            st.stop()

        # Call LangExtract.
        try:
            result = lx.extract(
                text_or_documents=full_text,
                prompt_description=EXTRACTION_PROMPT,
                examples=EXTRACTION_EXAMPLES,
                model_id="gemini-1.5-flash",
            )
        except Exception as e:  # UI boundary: network/model/config errors
            st.error(f"结构化提取失败:{e}")
            st.stop()
        st.success("✅ 结构化提取完成")

    # Build each record once and reuse it for both display and download.
    records = [_extraction_record(ext) for ext in (result.extractions or [])]

    # Show the results.
    st.subheader("📊 提取结果")
    if not records:
        st.warning("未提取到任何实体。")
    else:
        for record in records:
            st.json(record, expanded=False)

    # Download button.
    st.download_button(
        "💾 下载提取结果 (JSON)",
        data=json.dumps(records, ensure_ascii=False, indent=2),
        file_name="extractions.json",
        mime="application/json",
    )