nanoapple committed on
Commit
b3e29c3
·
verified ·
1 Parent(s): e4d3eeb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -56
app.py CHANGED
@@ -3,11 +3,7 @@ import tempfile, os, subprocess, json, time
3
  import fitz # PyMuPDF
4
  import langextract as lx
5
 
6
- # ------------------- 环境配置 -------------------
7
- API_KEY = os.getenv("LANGEXTRACT_API_KEY")
8
- if API_KEY:
9
- lx.configure(api_key=API_KEY)
10
-
11
  st.set_page_config(page_title="OCR + LangExtract", page_icon="📄")
12
  st.title("📄 OCR + LangExtract Demo")
13
  st.caption("上传 ≤ 5 页 PDF → OCR → 结构化提取")
@@ -16,80 +12,66 @@ st.caption("上传 ≤ 5 页 PDF → OCR → 结构化提取")
16
  uploaded = st.file_uploader("选择 PDF 文件(≤ 5 页)", type=["pdf"])
17
  if uploaded:
18
  with tempfile.TemporaryDirectory() as tmpdir:
19
- raw_path = os.path.join(tmpdir, "input.pdf")
20
- ocr_path = os.path.join(tmpdir, "ocr.pdf")
21
- with open(raw_path, "wb") as f:
22
  f.write(uploaded.read())
23
 
24
- # --------- 页数检测 ---------
25
- pages = fitz.open(raw_path).page_count
26
- if pages > 5:
27
- st.error(f"🚫 当前 {pages} 页,超出限制。请上传 5 页的 PDF。")
28
  st.stop()
29
 
30
  # --------- OCR 处理 ---------
31
- st.info("⚙️ 正在执行 OCR,请稍候 …")
32
- t0 = time.time()
33
  try:
34
- subprocess.run(
35
- [
36
- "ocrmypdf",
37
- raw_path,
38
- ocr_path,
39
- "-l", "eng+chi_sim", # 英文 + 简体中文
40
- "--skip-text", # 跳过已有文字页
41
- "--quiet"
42
- ],
43
- check=True,
44
- )
45
  except subprocess.CalledProcessError as e:
46
  st.error(f"OCR 失败:{e}")
47
  st.stop()
48
- st.success(f"OCR 完成,用时 {time.time() - t0:.1f}s")
49
 
50
- # --------- 提取全文 ---------
51
- doc = fitz.open(ocr_path)
52
- full_text = "\n".join(page.get_text() for page in doc)
53
-
54
- # --------- LangExtract 结构化 ---------
55
- st.info("🔍 LangExtract 正在解析 …")
56
- prompt = (
57
- "Extract all PERSON names and their roles or titles in the document. "
58
- "Use exact spans from the text and include start/end indices."
59
- )
60
 
 
61
  try:
62
  result = lx.extract(
63
  text_or_documents=full_text,
64
- prompt_description=prompt,
65
- model_id="gemini-2.5-flash", # 可改为 pro / ollama 等
66
  )
67
  except Exception as e:
68
- st.error(f"LangExtract 调用失败:{e}")
69
  st.stop()
70
 
71
  # --------- 显示结果 ---------
72
  st.subheader("📊 提取结果")
73
  if not result.extractions:
74
- st.warning("未找到符合条件的实体。")
75
  else:
76
  for ext in result.extractions:
77
- st.json(
78
- {
79
- "class": ext.extraction_class,
80
- "text": ext.extraction_text,
81
- "start": ext.begin_offset,
82
- "end": ext.end_offset,
83
- **(ext.attributes or {}),
84
- },
85
- expanded=False,
86
- )
87
 
88
- # 提供下载
89
- json_bytes = json.dumps([ext.to_dict() for ext in result.extractions], ensure_ascii=False, indent=2).encode()
90
  st.download_button(
91
- "💾 下载 JSON",
92
- data=json_bytes,
93
  file_name="extractions.json",
94
- mime="application/json",
95
- )
 
3
  import fitz # PyMuPDF
4
  import langextract as lx
5
 
6
+ # ------------------- 页面配置 -------------------
 
 
 
 
7
  st.set_page_config(page_title="OCR + LangExtract", page_icon="📄")
8
  st.title("📄 OCR + LangExtract Demo")
9
  st.caption("上传 ≤ 5 页 PDF → OCR → 结构化提取")
 
12
  uploaded = st.file_uploader("选择 PDF 文件(≤ 5 页)", type=["pdf"])
13
  if uploaded:
14
  with tempfile.TemporaryDirectory() as tmpdir:
15
+ input_path = os.path.join(tmpdir, "input.pdf")
16
+ output_path = os.path.join(tmpdir, "ocr.pdf")
17
+ with open(input_path, "wb") as f:
18
  f.write(uploaded.read())
19
 
20
+ # --------- 检查页数 ---------
21
+ doc = fitz.open(input_path)
22
+ if doc.page_count > 5:
23
+ st.error("🚫 当前 PDF 超过 5 页。请上传不超过 5 页的文件。")
24
  st.stop()
25
 
26
  # --------- OCR 处理 ---------
27
+ st.info("🔍 正在进行 OCR,请稍候 …")
 
28
  try:
29
+ subprocess.run([
30
+ "ocrmypdf",
31
+ input_path,
32
+ output_path,
33
+ "-l", "eng+chi_sim",
34
+ "--skip-text",
35
+ "--quiet"
36
+ ], check=True)
 
 
 
37
  except subprocess.CalledProcessError as e:
38
  st.error(f"OCR 失败:{e}")
39
  st.stop()
 
40
 
41
+ # --------- 提取文本 ---------
42
+ st.success("✅ OCR 完成,正在提取文本并调用 LangExtract …")
43
+ ocr_doc = fitz.open(output_path)
44
+ full_text = "\n".join([page.get_text() for page in ocr_doc])
 
 
 
 
 
 
45
 
46
+ # --------- LangExtract 调用 ---------
47
  try:
48
  result = lx.extract(
49
  text_or_documents=full_text,
50
+ prompt_description="Extract named people and organisations from the document.",
51
+ model_id="gemini-1.5-flash"
52
  )
53
  except Exception as e:
54
+ st.error(f"结构化提取失败:{e}")
55
  st.stop()
56
 
57
  # --------- 显示结果 ---------
58
  st.subheader("📊 提取结果")
59
  if not result.extractions:
60
+ st.warning("未提取到任何实体。")
61
  else:
62
  for ext in result.extractions:
63
+ st.json({
64
+ "class": ext.extraction_class,
65
+ "text": ext.extraction_text,
66
+ "start": ext.begin_offset,
67
+ "end": ext.end_offset,
68
+ **(ext.attributes or {})
69
+ }, expanded=False)
 
 
 
70
 
71
+ # 下载按钮
 
72
  st.download_button(
73
+ "💾 下载提取结果 (JSON)",
74
+ data=json.dumps([e.to_dict() for e in result.extractions], ensure_ascii=False, indent=2),
75
  file_name="extractions.json",
76
+ mime="application/json"
77
+ )