Spaces:

nanoapple
/

LangmyOCR

Sleeping

App Files Community

nanoapple committed 14 days ago

Commit

1788c95

verified ·

1 Parent(s): 020da56

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -28

app.py CHANGED Viewed

@@ -98,63 +98,92 @@ def run_ocr(pdf_file, langs: str, rotate_pages: bool, deskew: bool, clean: bool,
 def run_extract(sidecar_text: str, provider: str, model_id: str, prompt: str):
     if not sidecar_text:
         return None, None, "没有可供抽取的文本。"
     if provider == "None":
         return None, None, "未选择模型，跳过抽取。"
     if provider == "Gemini":
         api_key = os.environ.get("LANGEXTRACT_API_KEY")
         if not api_key:
             return None, None, "未检测到 Gemini API Key（LANGEXTRACT_API_KEY）。"
-        fence_output = False
-        use_schema_constraints = False
     elif provider == "OpenAI":
         api_key = os.environ.get("OPENAI_API_KEY")
         if not api_key:
             return None, None, "未检测到 OpenAI API Key（OPENAI_API_KEY）。"
-        fence_output = True
-        use_schema_constraints = False
     else:
         return None, None, "未知的 provider。"
-    # 最小 few-shot 示例（演示用；生产请替换为你的法律 schema）
     examples = [
         lx.data.ExampleData(
-            text="On 12/03/2023, ABC Pty Ltd terminated Ms Wang's employment.",
             extractions=[
                 lx.data.Extraction(
                     extraction_class="party",
-                    extraction_text="ABC Pty Ltd",
-                    attributes={"role": "respondent"},
                 ),
                 lx.data.Extraction(
                     extraction_class="event",
-                    extraction_text="terminated",
-                    attributes={"date": "12/03/2023"},
                 ),
             ],
         )
     ]
     with st.status("正在进行结构化抽取 …", expanded=False) as s:
         try:
-            result = lx.extract(
-                text_or_documents=sidecar_text,
-                prompt_description=prompt.strip(),
-                examples=examples,
-                model_id=model_id.strip(),
-                api_key=api_key,
-                fence_output=fence_output,
-                use_schema_constraints=use_schema_constraints,
-            )
-        except Exception as e:
-            s.update(label="抽取失败", state="error")
-            return None, None, f"LangExtract 抽取失败：{e}"
-        # 保存 JSONL + 生成交互式 HTML
-        work = Path(tempfile.mkdtemp(prefix="lx_"))
-        jsonl_path = work / "extractions.jsonl"
-        html_path = work / "review.html"
         try:
             lx.io.save_annotated_documents([result], output_name=str(jsonl_path))
             html_content = lx.visualize(str(jsonl_path))

 def run_extract(sidecar_text: str, provider: str, model_id: str, prompt: str):
     if not sidecar_text:
         return None, None, "没有可供抽取的文本。"
     if provider == "None":
         return None, None, "未选择模型，跳过抽取。"
+    # 1) 读取 Key，并统一默认打开 fence_output
+    fence_output = True               # << 对 Gemini 也打开
+    use_schema_constraints = False    # 先不启 Schema（必要时再开）
     if provider == "Gemini":
         api_key = os.environ.get("LANGEXTRACT_API_KEY")
         if not api_key:
             return None, None, "未检测到 Gemini API Key（LANGEXTRACT_API_KEY）。"
     elif provider == "OpenAI":
         api_key = os.environ.get("OPENAI_API_KEY")
         if not api_key:
             return None, None, "未检测到 OpenAI API Key（OPENAI_API_KEY）。"
     else:
         return None, None, "未知的 provider。"
+    # 2) 收紧提示语（覆盖面向法律的 schema），严格要求“只返回 JSON 数组”
+    strict_prompt = (
+        "You are an information extraction engine. "
+        "Extract legal entities, events, relationships, and evidence anchors from the input text. "
+        "Return ONLY a JSON array, no prose, no markdown, no comments. "
+        "Schema per item: {"
+        "\"class\": one of [\"party\",\"event\",\"date\",\"relation\",\"evidence\"], "
+        "\"text\": string (exact span), "
+        "\"attributes\": object (key-value), "
+        "\"source_hint\": string (optional page/line) "
+        "}."
+    )
+    # 3) 精简可运行的 few-shot（与法律场景贴近）
     examples = [
         lx.data.ExampleData(
+            text="On 15 February 2022, Dr Gavin Soo completed a medicolegal report to Walker Law Group.",
             extractions=[
                 lx.data.Extraction(
                     extraction_class="party",
+                    extraction_text="Walker Law Group",
+                    attributes={"role": "law_firm"},
                 ),
                 lx.data.Extraction(
                     extraction_class="event",
+                    extraction_text="completed a medicolegal report",
+                    attributes={"actor": "Dr Gavin Soo"},
+                ),
+                lx.data.Extraction(
+                    extraction_class="date",
+                    extraction_text="15 February 2022",
+                    attributes={}
                 ),
             ],
         )
     ]
+    # 4) 先跑一次；若解析失败，再以更强硬提示重试一次
+    work = Path(tempfile.mkdtemp(prefix="lx_"))
+    jsonl_path = work / "extractions.jsonl"
+    html_path = work / "review.html"
+    raw_path1 = work / "raw_attempt1.txt"
+    raw_path2 = work / "raw_attempt2.txt"
+    def _try_extract(prompt_text):
+        # LangExtract 没有公开 raw 输出参数，我们用 try/except 捕获并让其保存在日志（同时缩短输入验证）
+        return lx.extract(
+            text_or_documents=sidecar_text[:15000],  # 先限长，避免超长触发安全策略
+            prompt_description=prompt_text.strip(),
+            examples=examples,
+            model_id=model_id.strip(),
+            api_key=api_key,
+            fence_output=fence_output,
+            use_schema_constraints=use_schema_constraints,
+        )
     with st.status("正在进行结构化抽取 …", expanded=False) as s:
         try:
+            result = _try_extract(strict_prompt)
+        except Exception as e1:
+            # 第一次失败：很可能是返回了非 JSON。我们把提示再加强，强调 “only JSON array”
+            hard_prompt = strict_prompt + " Output must be a compact JSON array. Do not include any other text."
+            try:
+                result = _try_extract(hard_prompt)
+            except Exception as e2:
+                s.update(label="抽取失败", state="error")
+                return None, None, f"LangExtract 抽取失败：{e2}"
+        # 保存结果并可视化
         try:
             lx.io.save_annotated_documents([result], output_name=str(jsonl_path))
             html_content = lx.visualize(str(jsonl_path))