Update mcp/nlp.py
Browse files- mcp/nlp.py +45 -9
mcp/nlp.py
CHANGED
@@ -1,19 +1,55 @@
|
|
1 |
# mcp/nlp.py
|
2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
import spacy
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
-
|
|
|
6 |
try:
|
7 |
return spacy.load("en_core_web_sm")
|
8 |
-
except OSError:
|
9 |
raise RuntimeError(
|
10 |
-
"spaCy model 'en_core_web_sm' is not installed. "
|
11 |
-
"
|
12 |
-
|
|
|
13 |
|
14 |
-
nlp = load_model()
|
15 |
|
16 |
-
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
18 |
doc = nlp(text)
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# mcp/nlp.py
|
2 |
|
3 |
+
#!/usr/bin/env python3
|
4 |
+
"""MedGenesis – spaCy helper for lightweight keyword extraction.
|
5 |
+
|
6 |
+
Features
|
7 |
+
~~~~~~~~
|
8 |
+
* Lazy‑loads **`en_core_web_sm`** at first call; cached thereafter.
|
9 |
+
* If model missing, raises actionable RuntimeError — Dockerfile must
|
10 |
+
install via `python -m spacy download en_core_web_sm` (already in Dockerfile).
|
11 |
+
* `extract_keywords` returns **unique named‑entity strings** (>2 chars)
|
12 |
+
stripped of whitespace, preserving original casing.
|
13 |
+
* Adds fallback to simple noun‑chunk extraction when no entities found –
|
14 |
+
helps very short abstracts.
|
15 |
+
"""
|
16 |
+
from __future__ import annotations
|
17 |
+
|
18 |
import spacy
|
19 |
+
from functools import lru_cache
|
20 |
+
from typing import List
|
21 |
+
|
22 |
+
|
23 |
+
# ---------------------------------------------------------------------
|
24 |
+
# Model loader (cached)
|
25 |
+
# ---------------------------------------------------------------------
|
26 |
|
27 |
+
@lru_cache(maxsize=1)
def _load_model():
    """Load the spaCy ``en_core_web_sm`` pipeline, caching it after first use.

    Returns:
        The loaded spaCy ``Language`` pipeline.

    Raises:
        RuntimeError: when the model package is missing, with the exact
            Dockerfile line needed to install it (chained from the
            underlying ``OSError`` for debuggability).
    """
    model_name = "en_core_web_sm"
    try:
        pipeline = spacy.load(model_name)
    except OSError as err:
        # Fail loudly with an actionable message instead of letting the
        # raw OSError bubble up to callers.
        raise RuntimeError(
            "spaCy model 'en_core_web_sm' is not installed. Add\n"
            "    RUN python -m spacy download en_core_web_sm\n"
            "to your Dockerfile build stage."
        ) from err
    return pipeline
|
37 |
|
|
|
38 |
|
39 |
+
# ---------------------------------------------------------------------
|
40 |
+
# Public API
|
41 |
+
# ---------------------------------------------------------------------
|
42 |
+
|
43 |
+
def extract_keywords(text: str, *, min_len: int = 3) -> List[str]:
    """Return de-duplicated entity keywords (fallback: noun chunks).

    Named entities of at least ``min_len`` characters (after stripping
    whitespace) are returned first; when the document yields no entities
    (common for very short abstracts), noun chunks are used instead.

    Args:
        text: Raw text to analyse.
        min_len: Minimum keyword length, keyword-only (default 3).

    Returns:
        Unique keyword strings in first-seen document order. (The previous
        implementation returned ``list`` of a ``set``, whose iteration order
        is nondeterministic across interpreter runs due to string hash
        randomization — output order was unstable.)
    """
    nlp = _load_model()
    doc = nlp(text)

    ents = [
        cleaned
        for cleaned in (ent.text.strip() for ent in doc.ents)
        if len(cleaned) >= min_len
    ]
    if ents:
        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(dict.fromkeys(ents))

    # Fallback: noun chunks if spaCy found no entities (rare for tiny texts)
    chunks = (
        cleaned
        for cleaned in (chunk.text.strip() for chunk in doc.noun_chunks)
        if len(cleaned) >= min_len
    )
    return list(dict.fromkeys(chunks))
|
55 |
+
|