nanoapple committed on
Commit
1788c95
·
verified ·
1 Parent(s): 020da56

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -28
app.py CHANGED
@@ -98,63 +98,92 @@ def run_ocr(pdf_file, langs: str, rotate_pages: bool, deskew: bool, clean: bool,
98
  def run_extract(sidecar_text: str, provider: str, model_id: str, prompt: str):
99
  if not sidecar_text:
100
  return None, None, "没有可供抽取的文本。"
101
-
102
  if provider == "None":
103
  return None, None, "未选择模型,跳过抽取。"
104
 
 
 
 
105
  if provider == "Gemini":
106
  api_key = os.environ.get("LANGEXTRACT_API_KEY")
107
  if not api_key:
108
  return None, None, "未检测到 Gemini API Key(LANGEXTRACT_API_KEY)。"
109
- fence_output = False
110
- use_schema_constraints = False
111
  elif provider == "OpenAI":
112
  api_key = os.environ.get("OPENAI_API_KEY")
113
  if not api_key:
114
  return None, None, "未检测到 OpenAI API Key(OPENAI_API_KEY)。"
115
- fence_output = True
116
- use_schema_constraints = False
117
  else:
118
  return None, None, "未知的 provider。"
119
 
120
- # 最小 few-shot 示例(演示用;生产请替换为你的法律 schema)
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  examples = [
122
  lx.data.ExampleData(
123
- text="On 12/03/2023, ABC Pty Ltd terminated Ms Wang's employment.",
124
  extractions=[
125
  lx.data.Extraction(
126
  extraction_class="party",
127
- extraction_text="ABC Pty Ltd",
128
- attributes={"role": "respondent"},
129
  ),
130
  lx.data.Extraction(
131
  extraction_class="event",
132
- extraction_text="terminated",
133
- attributes={"date": "12/03/2023"},
 
 
 
 
 
134
  ),
135
  ],
136
  )
137
  ]
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  with st.status("正在进行结构化抽取 …", expanded=False) as s:
140
  try:
141
- result = lx.extract(
142
- text_or_documents=sidecar_text,
143
- prompt_description=prompt.strip(),
144
- examples=examples,
145
- model_id=model_id.strip(),
146
- api_key=api_key,
147
- fence_output=fence_output,
148
- use_schema_constraints=use_schema_constraints,
149
- )
150
- except Exception as e:
151
- s.update(label="抽取失败", state="error")
152
- return None, None, f"LangExtract 抽取失败:{e}"
153
-
154
- # 保存 JSONL + 生成交互式 HTML
155
- work = Path(tempfile.mkdtemp(prefix="lx_"))
156
- jsonl_path = work / "extractions.jsonl"
157
- html_path = work / "review.html"
158
  try:
159
  lx.io.save_annotated_documents([result], output_name=str(jsonl_path))
160
  html_content = lx.visualize(str(jsonl_path))
 
98
  def run_extract(sidecar_text: str, provider: str, model_id: str, prompt: str):
99
  if not sidecar_text:
100
  return None, None, "没有可供抽取的文本。"
 
101
  if provider == "None":
102
  return None, None, "未选择模型,跳过抽取。"
103
 
104
+ # 1) 读取 Key,并统一默认打开 fence_output
105
+ fence_output = True # << 对 Gemini 也打开
106
+ use_schema_constraints = False # 先不启 Schema(必要时再开)
107
  if provider == "Gemini":
108
  api_key = os.environ.get("LANGEXTRACT_API_KEY")
109
  if not api_key:
110
  return None, None, "未检测到 Gemini API Key(LANGEXTRACT_API_KEY)。"
 
 
111
  elif provider == "OpenAI":
112
  api_key = os.environ.get("OPENAI_API_KEY")
113
  if not api_key:
114
  return None, None, "未检测到 OpenAI API Key(OPENAI_API_KEY)。"
 
 
115
  else:
116
  return None, None, "未知的 provider。"
117
 
118
+ # 2) 收紧提示语(覆盖面向法律的 schema),严格要求“只返回 JSON 数组”
119
+ strict_prompt = (
120
+ "You are an information extraction engine. "
121
+ "Extract legal entities, events, relationships, and evidence anchors from the input text. "
122
+ "Return ONLY a JSON array, no prose, no markdown, no comments. "
123
+ "Schema per item: {"
124
+ "\"class\": one of [\"party\",\"event\",\"date\",\"relation\",\"evidence\"], "
125
+ "\"text\": string (exact span), "
126
+ "\"attributes\": object (key-value), "
127
+ "\"source_hint\": string (optional page/line) "
128
+ "}."
129
+ )
130
+
131
+ # 3) 精简可运行的 few-shot(与法律场景贴近)
132
  examples = [
133
  lx.data.ExampleData(
134
+ text="On 15 February 2022, Dr Gavin Soo completed a medicolegal report to Walker Law Group.",
135
  extractions=[
136
  lx.data.Extraction(
137
  extraction_class="party",
138
+ extraction_text="Walker Law Group",
139
+ attributes={"role": "law_firm"},
140
  ),
141
  lx.data.Extraction(
142
  extraction_class="event",
143
+ extraction_text="completed a medicolegal report",
144
+ attributes={"actor": "Dr Gavin Soo"},
145
+ ),
146
+ lx.data.Extraction(
147
+ extraction_class="date",
148
+ extraction_text="15 February 2022",
149
+ attributes={}
150
  ),
151
  ],
152
  )
153
  ]
154
 
155
+ # 4) 先跑一次;若解析失败,再以更强硬提示重试一次
156
+ work = Path(tempfile.mkdtemp(prefix="lx_"))
157
+ jsonl_path = work / "extractions.jsonl"
158
+ html_path = work / "review.html"
159
+ raw_path1 = work / "raw_attempt1.txt"
160
+ raw_path2 = work / "raw_attempt2.txt"
161
+
162
+ def _try_extract(prompt_text):
163
+ # LangExtract 没有公开 raw 输出参数,我们用 try/except 捕获并让其保存在日志(同时缩短输入验证)
164
+ return lx.extract(
165
+ text_or_documents=sidecar_text[:15000], # 先限长,避免超长触发安全策略
166
+ prompt_description=prompt_text.strip(),
167
+ examples=examples,
168
+ model_id=model_id.strip(),
169
+ api_key=api_key,
170
+ fence_output=fence_output,
171
+ use_schema_constraints=use_schema_constraints,
172
+ )
173
+
174
  with st.status("正在进行结构化抽取 …", expanded=False) as s:
175
  try:
176
+ result = _try_extract(strict_prompt)
177
+ except Exception as e1:
178
+ # 第一次失败:很可能是返回了非 JSON。我们把提示再加强,强调 “only JSON array”
179
+ hard_prompt = strict_prompt + " Output must be a compact JSON array. Do not include any other text."
180
+ try:
181
+ result = _try_extract(hard_prompt)
182
+ except Exception as e2:
183
+ s.update(label="抽取失败", state="error")
184
+ return None, None, f"LangExtract 抽取失败:{e2}"
185
+
186
+ # 保存结果并可视化
 
 
 
 
 
 
187
  try:
188
  lx.io.save_annotated_documents([result], output_name=str(jsonl_path))
189
  html_content = lx.visualize(str(jsonl_path))