MCP_Res / mcp /nlp.py
mgbam's picture
Update mcp/nlp.py
0375bcb verified
raw
history blame
770 Bytes
# mcp/nlp.py
import spacy
def load_model():
    """Load the best available spaCy pipeline.

    Candidate models are tried in order of preference: the scispacy
    biomedical model first, then the generic English model.

    Returns:
        The loaded spaCy pipeline for the first candidate that is installed.

    Raises:
        RuntimeError: If none of the candidate models can be loaded. The
            ``OSError`` from the last attempt is attached as ``__cause__``.
    """
    # Ordered by preference: scispacy first (if available), then generic spaCy.
    candidates = ("en_core_sci_sm", "en_core_web_sm")
    last_error = None
    for model_name in candidates:
        try:
            return spacy.load(model_name)
        except OSError as err:
            # Model not installed; remember why and try the next candidate.
            last_error = err
    # Chain explicitly so the traceback shows the underlying load failure as
    # the direct cause rather than an incidental "during handling" context.
    raise RuntimeError(
        "No spaCy model found! Please install 'en_core_sci_sm' (preferred) or 'en_core_web_sm' "
        "in your Dockerfile or requirements.txt before running the app."
    ) from last_error
# Shared pipeline, loaded once at import time and reused by extract_keywords.
# NOTE: importing this module raises RuntimeError if no spaCy model is installed.
nlp = load_model()
def extract_keywords(text: str) -> list:
    """Extract biomedical or general entities from text.

    Runs the module-level ``nlp`` pipeline over *text* and returns the text
    of each recognised entity, de-duplicated, in order of first appearance.

    Args:
        text: Raw input text. ``None``, empty, or whitespace-only input
            yields an empty list without invoking the pipeline.

    Returns:
        A list of unique entity strings; entities whose stripped text is
        two characters or shorter are dropped.
    """
    # Nothing to analyse: skip the pipeline call entirely.
    if not text or not text.strip():
        return []
    doc = nlp(text)
    # dict.fromkeys removes duplicates while keeping first-seen order. A plain
    # set() would hand back the entities in an arbitrary order that changes
    # between interpreter runs because of string hash randomisation.
    unique_entities = dict.fromkeys(
        ent.text for ent in doc.ents if len(ent.text.strip()) > 2
    )
    return list(unique_entities)