Spaces:

nanoapple
/

LangmyOCR

Sleeping

App Files Community

nanoapple committed 20 days ago

Commit

3546c19

verified ·

1 Parent(s): d42a887

Delete app.py

Browse files

Files changed (1) hide show

app.py +0 -86

app.py DELETED Viewed

@@ -1,86 +0,0 @@
-import streamlit as st
-import tempfile, os, subprocess, json, time
-import fitz  # PyMuPDF
-import langextract as lx
-st.set_page_config(page_title="OCR + LangExtract", page_icon="📄")  # Streamlit script: upload a PDF, OCR it with ocrmypdf, extract its text, run LangExtract, show results
-st.title("📄 OCR + LangExtract Demo")
-st.caption("上传 ≤ 5 页 PDF → OCR → 结构化提取")  # NOTE(review): caption advertises a 5-page limit; the check below enforces 10
-uploaded = st.file_uploader("选择 PDF 文件（≤ 10 页）", type=["pdf"])
-if uploaded:  # everything below runs only once a file has been uploaded
-    st.success("✅ 文件已上传，请稍候 …")
-    with st.spinner("📄 正在进行 OCR 和结构化提取 …"):
-        with tempfile.TemporaryDirectory() as tmpdir:  # scratch directory for input.pdf and ocr.pdf; removed when the block exits
-            input_path = os.path.join(tmpdir, "input.pdf")
-            output_path = os.path.join(tmpdir, "ocr.pdf")
-            with open(input_path, "wb") as f:  # persist the upload to disk so fitz and ocrmypdf can read it by path
-                f.write(uploaded.read())
-            # Check the page count
-            try:
-                doc = fitz.open(input_path)  # NOTE(review): doc is never closed in the visible code
-                if doc.page_count > 10:  # NOTE(review): rejects only above 10 pages, yet the caption and the error text below mention 5 — confirm the intended limit
-                    st.error("🚫 当前 PDF 超过 10 页。请上传不超过 5 页的文件。")
-                    st.stop()
-            except Exception as e:
-                st.error(f"无法打开 PDF 文件：{e}")
-                st.stop()
-            # OCR processing
-            try:
-                subprocess.run([  # argument list (no shell), so the paths are passed verbatim
-                    "ocrmypdf",
-                    input_path,
-                    output_path,
-                    "-l", "eng+chi_sim",  # OCR languages: English + Simplified Chinese
-                    "--skip-text",  # presumably leaves pages that already have a text layer untouched — see ocrmypdf docs
-                    "--quiet"
-                ], check=True)  # check=True raises CalledProcessError on a non-zero exit status
-                st.success("✅ OCR 完成")
-            except subprocess.CalledProcessError as e:  # NOTE(review): other failures (e.g. ocrmypdf not installed) are not caught here
-                st.error(f"OCR 失败：{e}")
-                st.stop()
-            # Extract the text
-            try:
-                ocr_doc = fitz.open(output_path)  # NOTE(review): ocr_doc is never closed in the visible code
-                full_text = "\n".join([page.get_text() for page in ocr_doc])  # per-page text joined with newlines
-            except Exception as e:
-                st.error(f"提取 OCR 文本失败：{e}")
-                st.stop()
-            # LangExtract call
-            try:
-                result = lx.extract(  # NOTE(review): no examples are passed — confirm lx.extract accepts a call without them
-                    text_or_documents=full_text,
-                    prompt_description="Extract named people and organisations from the document.",
-                    model_id="gemini-1.5-flash"
-                )
-                st.success("✅ 结构化提取完成")
-            except Exception as e:
-                st.error(f"结构化提取失败：{e}")
-                st.stop()
-            # Display the results
-            st.subheader("📊 提取结果")
-            if not result.extractions:
-                st.warning("未提取到任何实体。")
-            else:
-                for ext in result.extractions:  # one collapsed JSON widget per extraction
-                    st.json({
-                        "class": ext.extraction_class,
-                        "text": ext.extraction_text,
-                        "start": ext.begin_offset,  # NOTE(review): confirm begin_offset/end_offset exist on LangExtract extraction objects
-                        "end": ext.end_offset,
-                        **(ext.attributes or {})  # merge attributes into the dict; they can override the keys above on a name clash
-                    }, expanded=False)
-                # Download button
-                st.download_button(
-                    "💾 下载提取结果 (JSON)",
-                    data=json.dumps([e.to_dict() for e in result.extractions], ensure_ascii=False, indent=2),  # NOTE(review): confirm extractions expose to_dict()
-                    file_name="extractions.json",
-                    mime="application/json"
-                )