Update app.py
Browse files
app.py
CHANGED
@@ -98,63 +98,92 @@ def run_ocr(pdf_file, langs: str, rotate_pages: bool, deskew: bool, clean: bool,
|
|
98 |
def run_extract(sidecar_text: str, provider: str, model_id: str, prompt: str):
|
99 |
if not sidecar_text:
|
100 |
return None, None, "没有可供抽取的文本。"
|
101 |
-
|
102 |
if provider == "None":
|
103 |
return None, None, "未选择模型,跳过抽取。"
|
104 |
|
|
|
|
|
|
|
105 |
if provider == "Gemini":
|
106 |
api_key = os.environ.get("LANGEXTRACT_API_KEY")
|
107 |
if not api_key:
|
108 |
return None, None, "未检测到 Gemini API Key(LANGEXTRACT_API_KEY)。"
|
109 |
-
fence_output = False
|
110 |
-
use_schema_constraints = False
|
111 |
elif provider == "OpenAI":
|
112 |
api_key = os.environ.get("OPENAI_API_KEY")
|
113 |
if not api_key:
|
114 |
return None, None, "未检测到 OpenAI API Key(OPENAI_API_KEY)。"
|
115 |
-
fence_output = True
|
116 |
-
use_schema_constraints = False
|
117 |
else:
|
118 |
return None, None, "未知的 provider。"
|
119 |
|
120 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
examples = [
|
122 |
lx.data.ExampleData(
|
123 |
-
text="On
|
124 |
extractions=[
|
125 |
lx.data.Extraction(
|
126 |
extraction_class="party",
|
127 |
-
extraction_text="
|
128 |
-
attributes={"role": "
|
129 |
),
|
130 |
lx.data.Extraction(
|
131 |
extraction_class="event",
|
132 |
-
extraction_text="
|
133 |
-
attributes={"
|
|
|
|
|
|
|
|
|
|
|
134 |
),
|
135 |
],
|
136 |
)
|
137 |
]
|
138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
with st.status("正在进行结构化抽取 …", expanded=False) as s:
|
140 |
try:
|
141 |
-
result =
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
return None, None, f"LangExtract 抽取失败:{e}"
|
153 |
-
|
154 |
-
# 保存 JSONL + 生成交互式 HTML
|
155 |
-
work = Path(tempfile.mkdtemp(prefix="lx_"))
|
156 |
-
jsonl_path = work / "extractions.jsonl"
|
157 |
-
html_path = work / "review.html"
|
158 |
try:
|
159 |
lx.io.save_annotated_documents([result], output_name=str(jsonl_path))
|
160 |
html_content = lx.visualize(str(jsonl_path))
|
|
|
98 |
def run_extract(sidecar_text: str, provider: str, model_id: str, prompt: str):
|
99 |
if not sidecar_text:
|
100 |
return None, None, "没有可供抽取的文本。"
|
|
|
101 |
if provider == "None":
|
102 |
return None, None, "未选择模型,跳过抽取。"
|
103 |
|
104 |
+
# 1) 读取 Key,并统一默认打开 fence_output
|
105 |
+
fence_output = True # << 对 Gemini 也打开
|
106 |
+
use_schema_constraints = False # 先不启 Schema(必要时再开)
|
107 |
if provider == "Gemini":
|
108 |
api_key = os.environ.get("LANGEXTRACT_API_KEY")
|
109 |
if not api_key:
|
110 |
return None, None, "未检测到 Gemini API Key(LANGEXTRACT_API_KEY)。"
|
|
|
|
|
111 |
elif provider == "OpenAI":
|
112 |
api_key = os.environ.get("OPENAI_API_KEY")
|
113 |
if not api_key:
|
114 |
return None, None, "未检测到 OpenAI API Key(OPENAI_API_KEY)。"
|
|
|
|
|
115 |
else:
|
116 |
return None, None, "未知的 provider。"
|
117 |
|
118 |
+
# 2) 收紧提示语(覆盖面向法律的 schema),严格要求“只返回 JSON 数组”
|
119 |
+
strict_prompt = (
|
120 |
+
"You are an information extraction engine. "
|
121 |
+
"Extract legal entities, events, relationships, and evidence anchors from the input text. "
|
122 |
+
"Return ONLY a JSON array, no prose, no markdown, no comments. "
|
123 |
+
"Schema per item: {"
|
124 |
+
"\"class\": one of [\"party\",\"event\",\"date\",\"relation\",\"evidence\"], "
|
125 |
+
"\"text\": string (exact span), "
|
126 |
+
"\"attributes\": object (key-value), "
|
127 |
+
"\"source_hint\": string (optional page/line) "
|
128 |
+
"}."
|
129 |
+
)
|
130 |
+
|
131 |
+
# 3) 精简可运行的 few-shot(与法律场景贴近)
|
132 |
examples = [
|
133 |
lx.data.ExampleData(
|
134 |
+
text="On 15 February 2022, Dr Gavin Soo completed a medicolegal report to Walker Law Group.",
|
135 |
extractions=[
|
136 |
lx.data.Extraction(
|
137 |
extraction_class="party",
|
138 |
+
extraction_text="Walker Law Group",
|
139 |
+
attributes={"role": "law_firm"},
|
140 |
),
|
141 |
lx.data.Extraction(
|
142 |
extraction_class="event",
|
143 |
+
extraction_text="completed a medicolegal report",
|
144 |
+
attributes={"actor": "Dr Gavin Soo"},
|
145 |
+
),
|
146 |
+
lx.data.Extraction(
|
147 |
+
extraction_class="date",
|
148 |
+
extraction_text="15 February 2022",
|
149 |
+
attributes={}
|
150 |
),
|
151 |
],
|
152 |
)
|
153 |
]
|
154 |
|
155 |
+
# 4) 先跑一次;若解析失败,再以更强硬提示重试一次
|
156 |
+
work = Path(tempfile.mkdtemp(prefix="lx_"))
|
157 |
+
jsonl_path = work / "extractions.jsonl"
|
158 |
+
html_path = work / "review.html"
|
159 |
+
raw_path1 = work / "raw_attempt1.txt"
|
160 |
+
raw_path2 = work / "raw_attempt2.txt"
|
161 |
+
|
162 |
+
def _try_extract(prompt_text):
|
163 |
+
# Run a single lx.extract() call using prompt_text as the prompt description.
# Everything else (sidecar_text, examples, model_id, api_key, fence_output,
# use_schema_constraints) is read from the enclosing run_extract scope.
# This helper does not catch or log anything itself; the call sites wrap it in try/except.
# NOTE(review): the original author's note says LangExtract exposes no public
# raw-output parameter — not verifiable from this file, confirm against the library.
|
164 |
+
return lx.extract(
|
165 |
+
text_or_documents=sidecar_text[:15000], # only the first 15000 characters are sent; author's stated reason: avoid over-long input triggering safety policies
|
166 |
+
prompt_description=prompt_text.strip(),
|
167 |
+
examples=examples,
|
168 |
+
model_id=model_id.strip(),
|
169 |
+
api_key=api_key,
|
170 |
+
fence_output=fence_output,
|
171 |
+
use_schema_constraints=use_schema_constraints,
|
172 |
+
)
|
173 |
+
|
174 |
with st.status("正在进行结构化抽取 …", expanded=False) as s:
|
175 |
try:
|
176 |
+
result = _try_extract(strict_prompt)
|
177 |
+
except Exception as e1:
|
178 |
+
# 第一次失败:很可能是返回了非 JSON。我们把提示再加强,强调 “only JSON array”
|
179 |
+
hard_prompt = strict_prompt + " Output must be a compact JSON array. Do not include any other text."
|
180 |
+
try:
|
181 |
+
result = _try_extract(hard_prompt)
|
182 |
+
except Exception as e2:
|
183 |
+
s.update(label="抽取失败", state="error")
|
184 |
+
return None, None, f"LangExtract 抽取失败:{e2}"
|
185 |
+
|
186 |
+
# 保存结果并可视化
|
|
|
|
|
|
|
|
|
|
|
|
|
187 |
try:
|
188 |
lx.io.save_annotated_documents([result], output_name=str(jsonl_path))
|
189 |
html_content = lx.visualize(str(jsonl_path))
|