mgbam commited on
Commit
1974999
·
verified ·
1 Parent(s): 90abc98

Update mcp/nlp.py

Browse files
Files changed (1) hide show
  1. mcp/nlp.py +45 -9
mcp/nlp.py CHANGED
@@ -1,19 +1,55 @@
1
  # mcp/nlp.py
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import spacy
 
 
 
 
 
 
 
4
 
5
- def load_model():
 
6
  try:
7
  return spacy.load("en_core_web_sm")
8
- except OSError:
9
  raise RuntimeError(
10
- "spaCy model 'en_core_web_sm' is not installed. "
11
- "Ensure Dockerfile downloads it at build time."
12
- )
 
13
 
14
- nlp = load_model()
15
 
16
- def extract_keywords(text: str):
17
- """Extract unique Named Entities longer than 2 characters."""
 
 
 
 
 
18
  doc = nlp(text)
19
- return list({ent.text for ent in doc.ents if len(ent.text.strip()) > 2})
 
 
 
 
 
 
 
 
 
1
  # mcp/nlp.py
2
 
3
+ #!/usr/bin/env python3
4
+ """MedGenesis – spaCy helper for lightweight keyword extraction.
5
+
6
+ Features
7
+ ~~~~~~~~
8
+ * Lazy‑loads **`en_core_web_sm`** at first call; cached thereafter.
9
+ * If model missing, raises actionable RuntimeError — Dockerfile must
10
+ install via `python -m spacy download en_core_web_sm` (already in Dockerfile).
11
+ * `extract_keywords` returns **unique named‑entity strings** (>2 chars)
12
+ stripped of whitespace, preserving original casing.
13
+ * Adds fallback to simple noun‑chunk extraction when no entities found –
14
+ helps very short abstracts.
15
+ """
16
+ from __future__ import annotations
17
+
18
  import spacy
19
+ from functools import lru_cache
20
+ from typing import List
21
+
22
+
23
+ # ---------------------------------------------------------------------
24
+ # Model loader (cached)
25
+ # ---------------------------------------------------------------------
26
 
27
@lru_cache(maxsize=1)
def _load_model():
    """Load the spaCy ``en_core_web_sm`` pipeline, caching one instance.

    The ``lru_cache(maxsize=1)`` decorator makes this a lazy singleton:
    the model is loaded on first call and reused for the process lifetime.

    Returns:
        The loaded spaCy ``Language`` pipeline object.

    Raises:
        RuntimeError: when the model package is not installed; the message
            names the exact Dockerfile line the operator must add.
    """
    try:
        model = spacy.load("en_core_web_sm")
    except OSError as err:
        # Re-raise with an actionable message, keeping the original
        # OSError as the chained cause for debugging.
        raise RuntimeError(
            "spaCy model 'en_core_web_sm' is not installed. Add\n"
            "    RUN python -m spacy download en_core_web_sm\n"
            "to your Dockerfile build stage."
        ) from err
    return model
37
 
 
38
 
39
+ # ---------------------------------------------------------------------
40
+ # Public API
41
+ # ---------------------------------------------------------------------
42
+
43
def extract_keywords(text: str, *, min_len: int = 3) -> List[str]:
    """Return de-duplicated keyword strings extracted from *text*.

    Named entities are preferred; when spaCy finds no entities (common
    for very short abstracts) noun chunks are used as a fallback.

    Args:
        text: Raw input text to analyse.
        min_len: Minimum length in characters (after stripping) for a
            candidate to be kept. Defaults to 3.

    Returns:
        Unique keywords in first-occurrence order. Fix: the previous
        implementation returned ``list({...})``, whose ordering varies
        between interpreter runs (set iteration order is hash-dependent);
        ``dict.fromkeys`` keeps the same contents but makes the order
        deterministic and stable for callers.
    """
    nlp = _load_model()
    doc = nlp(text)

    # dict.fromkeys de-duplicates while preserving insertion order.
    entities = dict.fromkeys(
        ent.text.strip() for ent in doc.ents if len(ent.text.strip()) >= min_len
    )
    if entities:
        return list(entities)

    # Fallback: noun chunks when spaCy recognised no entities.
    chunks = dict.fromkeys(
        chunk.text.strip() for chunk in doc.noun_chunks if len(chunk.text.strip()) >= min_len
    )
    return list(chunks)
55
+