Spaces:

mgbam
/

UMLS

Running

App Files Files Community

mgbam committed 11 days ago

Commit

afa884d

verified ·

1 Parent(s): 09f0258

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -78

app.py CHANGED Viewed

@@ -1,28 +1,22 @@
 import os
 import streamlit as st
-import requests
 from transformers import AutoTokenizer, AutoModel
 import torch
 import numpy as np
 # Page configuration
-st.set_page_config(page_title="KRISSBERT UMLS Linker", layout="wide")
-st.title("🧬 KRISSBERT + UMLS Entity Linker on Hugging Face Spaces")
-# Environment variables
-UMLS_API_KEY = os.getenv("UMLS_API_KEY")
-if not UMLS_API_KEY:
-    st.error("❗ Please set the UMLS_API_KEY as a secret in your Space.")
-    st.stop()
-# UMLS API endpoints
-TGT_URL = "https://utslogin.nlm.nih.gov/cas/v1/api-key"
-SERVICE = "http://umlsks.nlm.nih.gov"
-SEARCH_URL = "https://uts-ws.nlm.nih.gov/rest/search/current"
-CONTENT_URL = "https://uts-ws.nlm.nih.gov/rest/content/current/"
-# Load KRISSBERT model
-MODEL_NAME = "microsoft/BiomedNLP-KRISSBERT-PubMed-UMLS-EL"
 @st.cache_resource
 def load_model():
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
@@ -32,80 +26,58 @@ def load_model():
 tokenizer, model = load_model()
-# Functions for UMLS API authentication
-def get_tgt(api_key):
-    resp = requests.post(TGT_URL, data={"apikey": api_key})
-    if resp.status_code == 201:
-        return resp.headers.get('location')
-    else:
-        st.error("Failed to obtain TGT from UMLS API.")
-        st.stop()
-@st.cache_data(ttl=3600)
-def get_st(tgt):
-    resp = requests.post(tgt, data={"service": SERVICE})
-    if resp.status_code == 200:
-        return resp.text
-    else:
-        st.error("Failed to obtain service ticket from UMLS API.")
-        st.stop()
-# Text embedding (tokenizer and model are unhashable, prefix with underscore)
 @st.cache_resource
 def embed_text(text, _tokenizer, _model):
-    inputs = _tokenizer(text, return_tensors="pt", truncation=True, padding=True)
     with torch.no_grad():
         outputs = _model(**inputs)
     emb = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
     return emb / np.linalg.norm(emb)
-# UI: Input box and examples
-st.markdown("Enter a biomedical sentence to link entities via UMLS API + KRISSBERT:")
 examples = [
-    "The patient was administered metformin for type 2 diabetes.",
-    "ER crowding has become a widespread issue in hospitals.",
-    "Tamoxifen is used in the treatment of ER-positive breast cancer."
 ]
-selected = st.selectbox("🔍 Example queries", ["Choose..."] + examples)
-sentence = st.text_area("📝 Sentence:", value=(selected if selected != "Choose..." else ""))
-if st.button("🔗 Link Entities"):
     if not sentence.strip():
-        st.warning("Please enter a sentence first.")
     else:
-        with st.spinner("Querying UMLS API and ranking... 🧠"):
-            # Authenticate
-            tgt = get_tgt(UMLS_API_KEY)
-            sticket = get_st(tgt)
-            # UMLS search for mentions
-            params = {"string": sentence, "ticket": sticket}
-            search_resp = requests.get(SEARCH_URL, params=params)
-            search_resp.raise_for_status()
-            results = search_resp.json().get("result", {}).get("results", [])
-            candidates = []
-            for res in results[:10]:
-                rui = res.get("ui")
-                name = res.get("name")
-                content_resp = requests.get(
-                    f"{CONTENT_URL}{rui}", params={"ticket": sticket}
-                )
-                definition = content_resp.json().get("result", {}).get("definition", "") if content_resp.status_code == 200 else ""
-                candidates.append({"ui": rui, "name": name, "definition": definition})
-            # Embed and score
-            sent_emb = embed_text(sentence, tokenizer, model)
-            for cand in candidates:
-                cand_emb = embed_text(cand['name'], tokenizer, model)
-                cand['score'] = float(np.dot(sent_emb, cand_emb))
-            ranked = sorted(candidates, key=lambda x: x['score'], reverse=True)[:5]
-            # Display
-            st.success("Top UMLS candidates:")
-            for item in ranked:
-                st.markdown(f"**{item['name']}** (CUI: `{item['ui']}`) — score: {item['score']:.3f}")
                 if item['definition']:
-                    st.markdown(f"> {item['definition']}\n")
-                st.markdown("---")

 import os
+import json
 import streamlit as st
 from transformers import AutoTokenizer, AutoModel
 import torch
 import numpy as np
+import faiss
 # Page configuration
+st.set_page_config(page_title='KRISSBERT UMLS Linker', layout='wide')
+st.title('🧬 KRISSBERT + UMLS Entity Linker (Local FAISS)')
+# File paths
+# (relative paths: resolved against the process working directory)
+# JSON metadata, parsed by load_umls_index()
+METADATA_PATH = 'umls_metadata.json'
+# NumPy .npy array file
+EMBED_PATH = 'umls_embeddings.npy'
+# Serialized FAISS index, read with faiss.read_index()
+INDEX_PATH = 'umls_index.faiss'
+# Model identifier handed to AutoTokenizer.from_pretrained() in load_model()
+MODEL_NAME = 'microsoft/BiomedNLP-KRISSBERT-PubMed-UMLS-EL'
+# Load model & tokenizer
 @st.cache_resource
 def load_model():
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 tokenizer, model = load_model()
+# Load UMLS FAISS index + metadata
+@st.cache_resource
+def load_umls_index():
+    """Load the FAISS index and its companion metadata from disk.
+
+    Returns:
+        tuple: ``(index, meta)`` where ``index`` is whatever
+        ``faiss.read_index(INDEX_PATH)`` returns and ``meta`` is the parsed
+        JSON content of ``METADATA_PATH``.
+
+    Raises:
+        OSError: if the metadata file cannot be opened.
+        json.JSONDecodeError: if the metadata file is not valid JSON.
+    """
+    # Use a context manager so the file handle is closed deterministically,
+    # and pin the encoding so parsing does not depend on the host locale.
+    with open(METADATA_PATH, 'r', encoding='utf-8') as fh:
+        meta = json.load(fh)
+    # The raw embedding matrix (EMBED_PATH) is deliberately not loaded:
+    # nothing in this function consumes it, and only the index and the
+    # metadata are returned to the caller.
+    index = faiss.read_index(INDEX_PATH)
+    return index, meta
+faiss_index, umls_meta = load_umls_index()
+# Embed text
 @st.cache_resource
 def embed_text(text, _tokenizer, _model):
+    """Return a unit-length NumPy embedding for ``text``.
+
+    The text is tokenized (with truncation and padding), passed through
+    ``_model`` with gradients disabled, and the hidden state at sequence
+    position 0 is taken, squeezed, moved to CPU and divided by its L2 norm.
+
+    Args:
+        text: input passed to the tokenizer.
+        _tokenizer: callable tokenizer; the leading underscore follows the
+            Streamlit convention for arguments excluded from the cache key.
+        _model: model called with the tokenizer output; same underscore
+            convention as ``_tokenizer``.
+
+    Returns:
+        numpy.ndarray: the normalised embedding.
+    """
+    # NOTE(review): position 0 is presumably the [CLS] token - confirm
+    # against the model card.
+    # NOTE(review): the result is plain data; st.cache_data may be a better
+    # fit than st.cache_resource here - confirm against Streamlit guidance.
+    inputs = _tokenizer(text, return_tensors='pt', truncation=True, padding=True)
     with torch.no_grad():
         outputs = _model(**inputs)
     emb = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
+    # NOTE(review): an all-zero embedding would make this a division by
+    # zero; assumed not to occur in practice - confirm.
     return emb / np.linalg.norm(emb)
+# UI: examples and input
+st.markdown('Enter a biomedical sentence to link entities via local UMLS FAISS index and KRISSBERT:')
 examples = [
+    'The patient was administered metformin for type 2 diabetes.',
+    'ER crowding has become a widespread issue in hospitals.',
+    'Tamoxifen is used in the treatment of ER-positive breast cancer.'
 ]
+# Picking an example pre-fills the text area; 'Choose...' leaves it empty.
+selected = st.selectbox('🔍 Example queries', ['Choose...'] + examples)
+sentence = st.text_area('📝 Sentence:', value=(selected if selected != 'Choose...' else ''))
+if st.button('🔗 Link Entities'):
     if not sentence.strip():
+        st.warning('Please enter a sentence first.')
     else:
+        with st.spinner('Embedding sentence and searching FAISS…'):
+            # FAISS expects a 2-D (n_queries, dim) array, hence the reshape.
+            sent_emb = embed_text(sentence, tokenizer, model).reshape(1, -1)
+            distances, indices = faiss_index.search(sent_emb, 5)
+            results = []
+            for idx in indices[0]:
+                # FAISS fills unused result slots with -1 when the index
+                # holds fewer vectors than requested; skip those slots.
+                if idx < 0:
+                    continue
+                entry = umls_meta.get(str(idx))
+                # Skip ids without metadata so no blank result card is
+                # rendered and the "no matches" branch stays reachable.
+                if not entry:
+                    continue
+                results.append({
+                    'cui': entry.get('cui', ''),
+                    'name': entry.get('name', ''),
+                    'definition': entry.get('definition', ''),
+                    'source': entry.get('source', '')
+                })
+        # Display
+        if results:
+            st.success('Top UMLS candidates:')
+            for item in results:
+                st.markdown('**' + item['name'] + '** (CUI: `' + item['cui'] + '`)')
                 if item['definition']:
+                    st.markdown('> ' + item['definition'] + '\n')
+                st.markdown('_Source: ' + item['source'] + '_\n---')
+        else:
+            st.info('No matches found in UMLS index.')