mgbam commited on
Commit
1974999
·
verified ·
1 Parent(s): 90abc98

Update mcp/nlp.py

Browse files
Files changed (1) hide show
  1. mcp/nlp.py +45 -9
mcp/nlp.py CHANGED
@@ -1,19 +1,55 @@
1
  # mcp/nlp.py
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import spacy
 
 
 
 
 
 
 
4
 
5
- def load_model():
 
6
  try:
7
  return spacy.load("en_core_web_sm")
8
- except OSError:
9
  raise RuntimeError(
10
- "spaCy model 'en_core_web_sm' is not installed. "
11
- "Ensure Dockerfile downloads it at build time."
12
- )
 
13
 
14
- nlp = load_model()
15
 
16
- def extract_keywords(text: str):
17
- """Extract unique Named Entities longer than 2 characters."""
 
 
 
 
 
18
  doc = nlp(text)
19
- return list({ent.text for ent in doc.ents if len(ent.text.strip()) > 2})
 
 
 
 
 
 
 
 
 
1
  # mcp/nlp.py
2
 
3
+ #!/usr/bin/env python3
4
+ """MedGenesis – spaCy helper for lightweight keyword extraction.
5
+
6
+ Features
7
+ ~~~~~~~~
8
+ * Lazy‑loads **`en_core_web_sm`** at first call; cached thereafter.
9
+ * If model missing, raises actionable RuntimeError — Dockerfile must
10
+ install via `python -m spacy download en_core_web_sm` (already in Dockerfile).
11
+ * `extract_keywords` returns **unique named‑entity strings** (>2 chars)
12
+ stripped of whitespace, preserving original casing.
13
+ * Adds fallback to simple noun‑chunk extraction when no entities found –
14
+ helps very short abstracts.
15
+ """
16
+ from __future__ import annotations
17
+
18
  import spacy
19
+ from functools import lru_cache
20
+ from typing import List
21
+
22
+
23
+ # ---------------------------------------------------------------------
24
+ # Model loader (cached)
25
+ # ---------------------------------------------------------------------
26
 
27
@lru_cache(maxsize=1)
def _load_model():
    """Load the spaCy ``en_core_web_sm`` pipeline, caching one instance.

    The ``lru_cache(maxsize=1)`` decorator makes this a lazy singleton:
    the model is loaded on first call and reused for the process lifetime.

    Returns:
        The loaded spaCy ``Language`` pipeline object.

    Raises:
        RuntimeError: when the model package is not installed; the message
            names the exact Dockerfile line the operator must add.
    """
    try:
        model = spacy.load("en_core_web_sm")
    except OSError as err:
        # Re-raise with an actionable message, keeping the original
        # OSError as the chained cause for debugging.
        raise RuntimeError(
            "spaCy model 'en_core_web_sm' is not installed. Add\n"
            "    RUN python -m spacy download en_core_web_sm\n"
            "to your Dockerfile build stage."
        ) from err
    return model
37
 
 
38
 
39
+ # ---------------------------------------------------------------------
40
+ # Public API
41
+ # ---------------------------------------------------------------------
42
+
43
def extract_keywords(text: str, *, min_len: int = 3) -> List[str]:
    """Return de-duplicated keyword strings extracted from *text*.

    Named entities are preferred; when spaCy finds no entities (common
    for very short abstracts) noun chunks are used as a fallback.

    Args:
        text: Raw input text to analyse.
        min_len: Minimum length in characters (after stripping) for a
            candidate to be kept. Defaults to 3.

    Returns:
        Unique keywords in first-occurrence order. Fix: the previous
        implementation returned ``list({...})``, whose ordering varies
        between interpreter runs (set iteration order is hash-dependent);
        ``dict.fromkeys`` keeps the same contents but makes the order
        deterministic and stable for callers.
    """
    nlp = _load_model()
    doc = nlp(text)

    # dict.fromkeys de-duplicates while preserving insertion order.
    entities = dict.fromkeys(
        ent.text.strip() for ent in doc.ents if len(ent.text.strip()) >= min_len
    )
    if entities:
        return list(entities)

    # Fallback: noun chunks when spaCy recognised no entities.
    chunks = dict.fromkeys(
        chunk.text.strip() for chunk in doc.noun_chunks if len(chunk.text.strip()) >= min_len
    )
    return list(chunks)
55
+