mgbam committed on
Commit
4f7b321
·
verified ·
1 Parent(s): e9da206

Update mcp/umls.py

Browse files
Files changed (1) hide show
  1. mcp/umls.py +119 -61
mcp/umls.py CHANGED
@@ -1,94 +1,152 @@
1
- # mcp/umls.py
2
- """
3
- Biomedical keyword/concept extractor for UMLS lookup.
 
 
 
 
4
 
5
- - Uses SciSpaCy if available (best for biomedical text).
6
- - Falls back to spaCy 'en_core_web_sm' (less accurate, general English).
7
- - Final fallback: regex keyword extraction.
8
- """
 
 
9
 
10
- import os, httpx, asyncio
11
- from functools import lru_cache
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- # UMLS API config
14
- UMLS_KEY = os.getenv("UMLS_KEY")
15
- _AUTH_URL = "https://utslogin.nlm.nih.gov/cas/v1/api-key"
16
- _SEARCH_URL = "https://uts-ws.nlm.nih.gov/rest/search/current"
17
 
18
- # ------- Robust concept extractor ------------------------------------------
19
- def extract_umls_concepts(text: str, min_length: int = 3) -> list[str]:
20
  """
21
- Extract biomedical concepts (for UMLS) from text.
22
- Priority: SciSpaCy -> spaCy -> regex.
 
 
 
 
23
  """
 
24
  try:
25
- # Try SciSpaCy first (best for biomedical NER)
26
- import scispacy
27
- import spacy
28
- try:
29
- nlp = spacy.load("en_ner_bionlp13cg_md")
30
- except Exception:
31
- nlp = spacy.load("en_core_sci_sm")
32
- doc = nlp(text)
33
- # All entities ≥ min_length, deduplicated
34
- ents = {ent.text.strip() for ent in doc.ents if len(ent.text.strip()) >= min_length}
35
- if ents:
36
- return list(ents)
37
  except Exception:
 
38
  pass
39
 
40
- # Fallback: spaCy general NER
41
  try:
42
- import spacy
43
- nlp = spacy.load("en_core_web_sm")
44
- doc = nlp(text)
45
- ents = {ent.text.strip() for ent in doc.ents if len(ent.text.strip()) >= min_length}
46
- if ents:
47
- return list(ents)
48
  except Exception:
49
  pass
50
 
51
- # Final fallback: Regex keywords
52
- import re
53
- words = re.findall(r"\b[a-zA-Z0-9\-]+\b", text)
54
- return list({w for w in words if len(w) >= min_length})
55
 
56
- # ------- UMLS API lookup (same as before, safe for missing/invalid key) ----
57
- async def _get_ticket() -> str | None:
58
- if not UMLS_KEY:
 
 
 
 
 
 
59
  return None
 
60
  try:
61
- async with httpx.AsyncClient(timeout=10) as c:
62
- tgt = await c.post(_AUTH_URL, data={"apikey": UMLS_KEY})
63
- tgt.raise_for_status()
64
- action = tgt.text.split('action="')[1].split('"')[0]
65
- st = await c.post(action, data={"service": "http://umlsks.nlm.nih.gov"})
66
- return st.text
 
 
 
 
67
  except Exception:
68
  return None
69
 
 
70
  @lru_cache(maxsize=512)
71
- async def lookup_umls(term: str) -> dict:
72
  """
73
- Return {term,cui,name,definition}.
74
- If auth/quota fails returns all keys as None (safe for UI).
 
75
  """
76
- ticket = await _get_ticket()
77
  if not ticket:
78
  return {"term": term, "cui": None, "name": None, "definition": None}
79
 
80
  params = {"string": term, "ticket": ticket, "pageSize": 1}
81
  try:
82
- async with httpx.AsyncClient(timeout=8) as c:
83
- r = await c.get(_SEARCH_URL, params=params)
84
- r.raise_for_status()
85
- items = r.json().get("result", {}).get("results", [])
86
- hit = items[0] if items else {}
87
  return {
88
  "term": term,
89
- "cui": hit.get("ui"),
90
- "name": hit.get("name"),
91
- "definition": hit.get("rootSource"),
92
  }
93
  except Exception:
94
  return {"term": term, "cui": None, "name": None, "definition": None}
 
1
+ import os
2
+ import re
3
+ import httpx
4
+ import asyncio
5
+ from functools import lru_cache
6
+ from pathlib import Path
7
+ from typing import List, Optional, Dict, Any
8
 
9
# ---------------------------------------------------------------------------
# UMLS credentials and endpoints
# ---------------------------------------------------------------------------
# Read once at import time; None when the UMLS_KEY environment variable is unset.
UMLS_API_KEY = os.environ.get("UMLS_KEY")
# URL the API key is POSTed to when requesting a ticket.
UMLS_AUTH_URL = "https://utslogin.nlm.nih.gov/cas/v1/api-key"
# URL queried for concept search.
UMLS_SEARCH_URL = "https://uts-ws.nlm.nih.gov/rest/search/current"
15
 
16
+ # ---------------------------------------------------------------------------
17
+ # Named types
18
+ # ---------------------------------------------------------------------------
19
class UMLSResult(Dict[str, Optional[str]]):
    """Dictionary describing the outcome of one UMLS lookup.

    Expected keys are ``term``, ``cui``, ``name`` and ``definition``;
    every value is either a string or ``None``.
    """
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # NLP model loading with caching
28
+ # ---------------------------------------------------------------------------
29
@lru_cache(maxsize=None)
def _load_spacy_model(model_name: str):
    """Return the spaCy pipeline named *model_name*, memoised per name.

    spaCy is imported inside the function so that importing this module
    does not require spaCy to be installed. Any error raised by the import
    or by ``spacy.load`` propagates to the caller (and is not cached).
    """
    from spacy import load as load_pipeline

    return load_pipeline(model_name)
33
+
34
@lru_cache(maxsize=None)
def _load_scispacy_model():
    """Return a biomedical spaCy pipeline, memoised after the first success.

    ``en_ner_bionlp13cg_md`` is tried first; if loading it fails for any
    reason, ``en_core_sci_sm`` is loaded instead. A failure of the second
    load propagates to the caller.
    """
    preferred, fallback = "en_ner_bionlp13cg_md", "en_core_sci_sm"
    try:
        pipeline = _load_spacy_model(preferred)
    except Exception:
        pipeline = _load_spacy_model(fallback)
    return pipeline
41
+
42
@lru_cache(maxsize=None)
def _load_general_spacy():
    """Return the general-English ``en_core_web_sm`` pipeline, memoised."""
    general_model = "en_core_web_sm"
    return _load_spacy_model(general_model)
45
+
46
+ # ---------------------------------------------------------------------------
47
+ # Concept extraction utilities
48
+ # ---------------------------------------------------------------------------
49
def _extract_entities(nlp, text: str, min_length: int) -> List[str]:
    """Return the unique entity strings that *nlp* finds in *text*.

    Args:
        nlp: A callable pipeline; ``nlp(text)`` must return an object whose
            ``ents`` attribute iterates over items exposing ``.text``.
        text: The text to analyse.
        min_length: Minimum length (after stripping surrounding whitespace)
            an entity must have to be kept.

    Returns:
        Whitespace-stripped entity texts, de-duplicated, in order of first
        appearance.
    """
    doc = nlp(text)
    # Strip each entity exactly once, then filter on the stripped length.
    stripped = (ent.text.strip() for ent in doc.ents)
    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # make the output order vary between interpreter runs.
    return list(dict.fromkeys(s for s in stripped if len(s) >= min_length))
57
+
58
+
59
def _regex_fallback(text: str, min_length: int) -> List[str]:
    """Extract candidate keywords from *text* with a plain regex.

    A token is a run of ASCII letters, digits, or hyphens delimited by word
    boundaries. Tokens shorter than *min_length* are dropped.

    Returns:
        The unique tokens in order of first appearance (empty list when
        nothing matches).
    """
    tokens = re.findall(r"\b[a-zA-Z0-9\-]+\b", text)
    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # make the output order vary between interpreter runs.
    return list(dict.fromkeys(t for t in tokens if len(t) >= min_length))
65
 
 
 
 
 
66
 
67
def extract_umls_concepts(text: str, min_length: int = 3) -> List[str]:
    """Extract biomedical concepts from *text*.

    Extractors are tried in priority order and the first non-empty result
    wins:

    1. SciSpaCy (``en_ner_bionlp13cg_md`` or ``en_core_sci_sm``)
    2. spaCy general NER (``en_core_web_sm``)
    3. Regex tokens

    Args:
        text: The text to analyse. Empty (or ``None``) input yields ``[]``
            without loading any NLP model.
        min_length: Minimum length a concept string must have to be kept.

    Returns:
        A list of unique strings (possibly empty).
    """
    if not text:
        # Nothing to extract; skip the (expensive) model loading entirely.
        return []

    for load_pipeline in (_load_scispacy_model, _load_general_spacy):
        try:
            entities = _extract_entities(load_pipeline(), text, min_length)
        except Exception:
            # Deliberate best-effort: a missing package/model or a pipeline
            # failure simply moves on to the next extractor.
            continue
        if entities:
            return entities

    return _regex_fallback(text, min_length)
 
 
100
 
101
+ # ---------------------------------------------------------------------------
102
+ # UMLS API integration
103
+ # ---------------------------------------------------------------------------
104
async def _get_umls_ticket() -> Optional[str]:
    """Obtain a UMLS service ticket for a subsequent search request.

    The API key is POSTed to ``UMLS_AUTH_URL``; the ``action="..."`` URL
    found in that response is then POSTed to in order to get the ticket.

    Returns:
        The ticket text, or ``None`` when the API key is not configured or
        any step fails (network error, HTTP error status, unexpected
        response body, empty ticket).
    """
    if not UMLS_API_KEY:
        return None

    try:
        async with httpx.AsyncClient(timeout=10) as client:
            tgt_resp = await client.post(
                UMLS_AUTH_URL, data={"apikey": UMLS_API_KEY}
            )
            tgt_resp.raise_for_status()
            # The ticket-granting URL is embedded in the form's action
            # attribute; an IndexError here (no such attribute) is handled
            # by the except clause below.
            tgt_url = tgt_resp.text.split('action="')[1].split('"')[0]
            st_resp = await client.post(
                tgt_url, data={"service": "http://umlsks.nlm.nih.gov"}
            )
            # Without this check an HTTP error page would be returned as if
            # it were a valid ticket.
            st_resp.raise_for_status()
            ticket = st_resp.text.strip()
            return ticket or None
    except Exception:
        # Best-effort by design: callers treat None as "lookup unavailable".
        return None
125
 
126
# Results of completed lookups, keyed by term. functools.lru_cache must not
# decorate a coroutine function: it would cache the coroutine object itself,
# and awaiting that object a second time raises RuntimeError. The awaited
# results are therefore cached here explicitly.
_UMLS_LOOKUP_CACHE: Dict[str, Dict[str, Optional[str]]] = {}
_UMLS_LOOKUP_CACHE_MAX = 512


async def lookup_umls(term: str) -> UMLSResult:
    """Look up *term* with the UMLS search API.

    Args:
        term: The string to search for.

    Returns:
        A ``UMLSResult`` with keys ``term``, ``cui``, ``name`` and
        ``definition``. When no ticket can be obtained, the request fails,
        or nothing matches, every value except ``term`` is ``None``.

    Responses that were actually received from the search endpoint are
    cached (at most 512 terms, oldest entry evicted first). Authentication
    and transport failures are not cached, so they are retried on the next
    call. A fresh dict is returned on every call.
    """
    cached = _UMLS_LOOKUP_CACHE.get(term)
    if cached is not None:
        return UMLSResult(cached)

    ticket = await _get_umls_ticket()
    if not ticket:
        return UMLSResult(term=term, cui=None, name=None, definition=None)

    params = {"string": term, "ticket": ticket, "pageSize": 1}
    try:
        async with httpx.AsyncClient(timeout=8) as client:
            resp = await client.get(UMLS_SEARCH_URL, params=params)
            resp.raise_for_status()
            results = resp.json().get("result", {}).get("results", [])
        first = results[0] if results else {}
        # NOTE(review): the UTS search API is documented to answer a miss
        # with a placeholder row (ui == "NONE"); treat it as "no hit" so the
        # placeholder is not shown as a real CUI. Confirm against the
        # current API docs.
        if first.get("ui") == "NONE":
            first = {}
        record = {
            "term": term,
            "cui": first.get("ui"),
            "name": first.get("name"),
            # Search rows usually carry no "definition" field, so the
            # vocabulary name in "rootSource" is used as a stand-in.
            "definition": first.get("definition") or first.get("rootSource"),
        }
    except Exception:
        # Best-effort by design: the caller gets an all-None result.
        return UMLSResult(term=term, cui=None, name=None, definition=None)

    if len(_UMLS_LOOKUP_CACHE) >= _UMLS_LOOKUP_CACHE_MAX:
        # Dicts keep insertion order, so the first key is the oldest entry.
        _UMLS_LOOKUP_CACHE.pop(next(iter(_UMLS_LOOKUP_CACHE)))
    _UMLS_LOOKUP_CACHE[term] = record
    return UMLSResult(record)


# Kept so code that called the former lru_cache wrapper's cache_clear() works.
lookup_umls.cache_clear = _UMLS_LOOKUP_CACHE.clear