ayushnoori commited on
Commit
d4ca2d2
·
1 Parent(s): dc3f347

Initial commit

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ media/pfp/*.png filter=lfs diff=lfs merge=lfs -text
37
+ data/*.csv filter=lfs diff=lfs merge=lfs -text
38
+ *.csv filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ignore Mac temporary files
2
+ *.DS_Store
3
+ .DS_Store
4
+
5
+ # Ignore python cache files
6
+ __pycache__/
7
+
8
+ # Ignore code
9
+ code/*
10
+
11
+ # Ignore model files
12
+ data/*.pt
13
+ data/disease_splits/*
14
+ models/embeddings/*
15
+ models/checkpoints/*
16
+
17
+ # Ignore secrets
18
+ .streamlit/secrets.toml
19
+
20
+ # Ignore user DB
21
+ auth/*
.streamlit/config.toml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ [client]
2
+ showSidebarNavigation = false
3
+
4
+ [theme]
5
+ base="light"
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
  title: Clinical Drug Repurposing
3
- emoji: 🐢
4
- colorFrom: gray
5
- colorTo: green
6
  sdk: streamlit
7
- sdk_version: 1.36.0
8
  app_file: app.py
9
  pinned: false
10
  ---
 
1
  ---
2
  title: Clinical Drug Repurposing
3
+ emoji: ⚕️
4
+ colorFrom: red
5
+ colorTo: purple
6
  sdk: streamlit
7
+ sdk_version: 1.34.0
8
  app_file: app.py
9
  pinned: false
10
  ---
app.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ # User authentication
4
+ import gspread
5
+ from oauth2client.service_account import ServiceAccountCredentials
6
+ import hmac
7
+
8
+ # Standard imports
9
+ import pandas as pd
10
+
11
+ # Custom and other imports
12
+ import project_config
13
+ from menu import menu
14
+
15
+ # Initialize st.session_state.role to None
16
+ if "role" not in st.session_state:
17
+ st.session_state.role = None
18
+
19
+
20
+ # From https://stackoverflow.com/questions/55961295/serviceaccountcredentials-from-json-keyfile-name-equivalent-for-remote-json
21
+ # See also https://www.slingacademy.com/article/pandas-how-to-read-and-update-google-sheet-files/
22
+ # See also https://docs.streamlit.io/develop/tutorials/databases/private-gsheet
23
+ # Note that the secrets cannot be passed in a group in HuggingFace Spaces,
24
+ # which is required for the native Streamlit implementation
25
def create_keyfile_dict():
    """Assemble a Google service-account keyfile dict from Streamlit secrets.

    HuggingFace Spaces cannot pass secrets as a nested group (which the
    native Streamlit gsheets integration requires), so every field of the
    service-account JSON is stored as an individual secret and reassembled
    here into a plain dict.

    Returns:
        dict: keyfile fields suitable for
        ``ServiceAccountCredentials.from_json_keyfile_dict``.
    """
    # All fields of a standard service-account JSON keyfile, in the
    # conventional order. The 'spreadsheet' secret is read separately
    # by the caller and is deliberately not part of the keyfile.
    field_names = (
        "type",
        "project_id",
        "private_key_id",
        # Newlines in the private key must already be literal newlines in
        # the stored secret (replace \n by hand when pasting the secret).
        "private_key",
        "client_email",
        "client_id",
        "auth_uri",
        "token_uri",
        "auth_provider_x509_cert_url",
        "client_x509_cert_url",
        "universe_domain",
    )
    return {name: st.secrets[name] for name in field_names}
42
+
43
+
44
def check_password():
    """Returns `True` if the user had a correct password.

    Renders the login form and, on submit, validates the entered username
    and password against the user database: a local CSV when running on
    VDI/locally, otherwise a Google Sheet. All authentication results
    (username_correct, password_correct, role, name, team, profile_pic)
    are stored in st.session_state.
    """

    def login_form():
        """Form with widgets to collect user information"""
        # Header: center the logo in the middle of three columns
        col1, col2, col3 = st.columns(3)
        with col2:
            st.image(str(project_config.MEDIA_DIR / 'gravity_logo.svg'), width=300)

        # Submitting the form triggers password_entered via on_click
        with st.form("Credentials"):
            st.text_input("Username", key="username")
            st.text_input("Password", type="password", key="password")
            st.form_submit_button("Log In", on_click=password_entered)

    def password_entered():
        """Checks whether a password entered by the user is correct."""

        if project_config.VDI or project_config.LOCAL:

            # Read the user database from a local CSV
            user_db = pd.read_csv(project_config.AUTH_DIR / "crd_user_db.csv")

        else:

            # Define the scope
            scope = [
                'https://spreadsheets.google.com/feeds',
                'https://www.googleapis.com/auth/drive'
            ]

            # Add credentials to the account (keyfile rebuilt from secrets)
            creds = ServiceAccountCredentials.from_json_keyfile_dict(create_keyfile_dict(), scope)

            # Authenticate and create the client
            client = gspread.authorize(creds)

            # Open the spreadsheet and load the user database worksheet
            sheet = client.open_by_url(st.secrets['spreadsheet']).worksheet("user_db")
            data = sheet.get_all_records()
            user_db = pd.DataFrame(data)

        # Check if the username is in the database
        if st.session_state["username"] in user_db.username.values:

            st.session_state["username_correct"] = True

            # Check if the password is correct
            # NOTE(review): passwords appear to be stored and compared in
            # plaintext; compare_digest only defends against timing attacks,
            # not disclosure — consider storing salted hashes instead.
            if hmac.compare_digest(
                st.session_state["password"],
                user_db.loc[user_db.username == st.session_state["username"], "password"].values[0],
            ):

                st.session_state["password_correct"] = True

                # Check if the username is an admin
                if st.session_state["username"] in user_db[user_db.role == "admin"].username.values:
                    st.session_state["role"] = "admin"
                else:
                    st.session_state["role"] = "user"

                # Retrieve and store user name and team
                st.session_state["name"] = user_db.loc[user_db.username == st.session_state["username"], "name"].values[0]
                st.session_state["team"] = user_db.loc[user_db.username == st.session_state["username"], "team"].values[0]
                st.session_state["profile_pic"] = st.session_state["username"]

                # Don't store the password
                del st.session_state["password"]

            else:
                st.session_state["password_correct"] = False

        else:
            st.session_state["username_correct"] = False
            st.session_state["password_correct"] = False

    # Return True if the username + password is validated
    if st.session_state.get("password_correct", False):
        return True

    # Show inputs for username + password
    login_form()
    if "password_correct" in st.session_state:

        # Report the most specific failure to the user
        if not st.session_state["username_correct"]:
            st.error("User not found.")
        elif not st.session_state["password_correct"]:
            st.error("The password you entered is incorrect.")
        else:
            st.error("An unexpected error occurred.")

    return False
136
+
137
# Render the dynamic navigation menu (login link only until authenticated)
menu()

# Gate the rest of the app behind authentication
if not check_password():
    st.stop()

# Authenticated: go straight to the About page
st.switch_page("pages/about.py")
data/kg_edge_types.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ee79b2f5021304a4dd82581568e8a8c940f94b29cd1206f7730bdff6b82cab4
3
+ size 5288
data/kg_edges.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab7d0e23c56381abf8e214cc5d4fae4e6a8b98957c8f2e5272b4f800953b1461
3
+ size 2765378133
data/kg_node_types.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1a0afff52deec5f48689a22a479d14cd49333759e054624366687ec4ef306c8
3
+ size 192
data/kg_nodes.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a21c42a1ee345195038d438854ee5d4befa7b1984e5efa4865ed74825a75b6d9
3
+ size 8529743
media/about_header.svg ADDED
media/explore_header.svg ADDED
media/gravity_logo.png ADDED
media/gravity_logo.svg ADDED
media/input_header.svg ADDED
media/pfp/anoori.png ADDED

Git LFS Details

  • SHA256: 56f2cd51f6496ff1e43f0ce3fb63145a442772b16e3d456bba06cf86d78671cf
  • Pointer size: 132 Bytes
  • Size of remote file: 1.53 MB
media/pfp/gravity.png ADDED

Git LFS Details

  • SHA256: 348a8c9cabd92f92e0e088f24c7ddb10120911c3c492f8e91baf02a870d464fd
  • Pointer size: 131 Bytes
  • Size of remote file: 114 kB
media/pfp/mzitnik.png ADDED

Git LFS Details

  • SHA256: b514858118909ce8004028a1f87f3e7a259415d730d34371830e884a1343da2f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.2 MB
media/pfp/ndagan.png ADDED

Git LFS Details

  • SHA256: d7d169b3a4cceca7bcb829ae02ea5ce912c94b5b66006343f1d5bfdc7296ce79
  • Pointer size: 132 Bytes
  • Size of remote file: 1.07 MB
media/pfp/rbalicer.png ADDED

Git LFS Details

  • SHA256: c1b68144f018798483fb7485d9fa46350e6274bc9ece3d5548ec33226ca2690d
  • Pointer size: 131 Bytes
  • Size of remote file: 580 kB
media/predict_header.svg ADDED
media/validate_header.svg ADDED
menu.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # From https://docs.streamlit.io/develop/tutorials/multipage/st.page_link-nav
2
+ import streamlit as st
3
+ import os
4
+ import project_config
5
+
6
def authenticated_menu():
    """Render the sidebar navigation for a logged-in user."""

    # Profile picture: fall back to the generic logo when the user has
    # no picture of their own under media/pfp/.
    pfp_dir = project_config.MEDIA_DIR / 'pfp'
    picture = str(pfp_dir / f"{st.session_state.profile_pic}.png")
    if not os.path.exists(picture):
        picture = str(pfp_dir / "gravity.png")
    st.sidebar.image(picture, use_column_width=True)
    st.sidebar.markdown("---")

    # Page links; Predict/Validate stay disabled until a query is submitted.
    # st.sidebar.page_link("app.py", label="Switch Accounts", icon="🔒")
    no_query = "query" not in st.session_state
    st.sidebar.page_link("pages/about.py", label="About", icon="📖")
    st.sidebar.page_link("pages/input.py", label="Input", icon="💡")
    st.sidebar.page_link("pages/predict.py", label="Predict", icon="🔍",
                         disabled=no_query)
    st.sidebar.page_link("pages/validate.py", label="Validate", icon="✅",
                         disabled=no_query)
    # st.sidebar.page_link("pages/explore.py", label="Explore", icon="🔍")

    # Admin-only page
    if st.session_state.role in ["admin"]:
        st.sidebar.page_link("pages/admin.py", label="Manage Users", icon="🔧")

    # Logging out simply clears the whole session state
    st.sidebar.markdown("---")
    st.sidebar.button("Log Out", on_click=lambda: st.session_state.clear())
30
+
31
+
32
def unauthenticated_menu():
    """Render the sidebar navigation for a visitor who is not logged in."""
    # Only the login page is reachable before authentication.
    st.sidebar.page_link("app.py", label="Log In", icon="🔒")
36
+
37
+
38
def menu():
    """Render whichever sidebar menu matches the current auth state."""
    # A user counts as logged in once a non-None role has been stored.
    logged_in = "role" in st.session_state and st.session_state.role is not None
    if logged_in:
        authenticated_menu()
    else:
        unauthenticated_menu()
44
+
45
+
46
def menu_with_redirect():
    """Bounce unauthenticated users to the login page, then render the menu."""
    logged_out = "role" not in st.session_state or st.session_state.role is None
    if logged_out:
        # switch_page stops this script and reruns app.py
        st.switch_page("app.py")
    menu()
pages/about.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
from menu import menu_with_redirect

# Path manipulation
from pathlib import Path

# Custom and other imports
import project_config

# Redirect to app.py if not logged in, otherwise show the navigation menu
menu_with_redirect()

# Page header image
st.image(str(project_config.MEDIA_DIR / 'about_header.svg'), use_column_width=True)

# Main content: greet the logged-in user (name is stored at login)
st.markdown(f"Hello, {st.session_state.name}! Welcome to GRAVITY, a **GR**aph **A**I **VI**sualization **T**ool to query and visualize knowledge graph-grounded biomedical AI models.")

# Subheader
st.subheader("Clinical Drug Repurposing", divider = "grey")

st.markdown("""
Here, we use GRAVITY to visualize the outputs of our clinical drug repurposing algorithm. The algorithm predicts the probability of a drug treating a disease based on the drug-disease relationship in the knowledge graph.
""")

# Center the call-to-action button in the middle of three columns
col1, col2, col3 = st.columns(3)

with col2:
    if st.button("Make Predictions"):
        st.switch_page("pages/input.py")
pages/admin.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
from menu import menu_with_redirect

# Redirect to app.py if not logged in, otherwise show the navigation menu
menu_with_redirect()

# Verify the user's role: this page is admin-only
if st.session_state.role not in ["admin"]:
    st.warning("You do not have permission to view this page.")
    st.stop()

st.title("User Management")
# Fix: user-facing message read "currently logged with the role" —
# corrected to "logged in with the role".
st.markdown(f"You are currently logged in with the role of {st.session_state.role}.")
pages/input.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from menu import menu_with_redirect
3
+
4
+ # Standard imports
5
+ import numpy as np
6
+ import pandas as pd
7
+ import subprocess
8
+
9
+ # Path manipulation
10
+ import os
11
+ from pathlib import Path
12
+ from huggingface_hub import hf_hub_download
13
+
14
+ # Custom and other imports
15
+ import project_config
16
+ from utils import load_kg
17
+
18
# Redirect to app.py if not logged in, otherwise show the navigation menu
menu_with_redirect()

# Page header image
st.image(str(project_config.MEDIA_DIR / 'input_header.svg'), use_column_width=True)

st.subheader("Choose Disease Split", divider = "red")
25
+
26
with st.spinner('Loading disease splits...'):

    if project_config.VDI or project_config.LOCAL:

        # Read disease splits from the local copy
        # (synced from Kempner using sync_data.sh)
        disease_splits = pd.read_csv(project_config.DATA_DIR / 'disease_splits' / 'disease_splits.csv',
                                     dtype = {'node_index': str, 'disease_split_index': str})

    else:

        # Read disease splits from HF.
        # FIX: hf_hub_download returns the *local file path* of the
        # downloaded artifact, not a DataFrame — the CSV must still be
        # read with pandas (same dtypes as the local branch) before the
        # groupby/indexing below can work.
        disease_splits_path = hf_hub_download(repo_id=project_config.HF_REPO,
                                              filename='disease_split/disease_splits.csv',
                                              token=st.secrets["HF_TOKEN"], repo_type="dataset")
        disease_splits = pd.read_csv(disease_splits_path,
                                     dtype = {'node_index': str, 'disease_split_index': str})

    # Group disease splits by disease_split_index column to count nodes per split
    disease_splits_grouped = disease_splits.groupby('disease_split_index').size().reset_index(name='node_count')

    # Subset to unique disease splits (rows where the node is its own split head)
    splits_df = disease_splits[disease_splits['node_index'] == disease_splits['disease_split_index']]
    splits_df = splits_df.drop_duplicates(subset='disease_split_index').reset_index(drop=True)
    splits_df = splits_df[['node_index', 'node_name', 'node_id']]

    # Merge with per-split node counts
    splits_df = splits_df.merge(disease_splits_grouped, left_on='node_index', right_on='disease_split_index', how='left')
    splits_df = splits_df.drop(columns='disease_split_index')
    splits_df['node_name'] = splits_df['node_name'].str.replace(' \\(disease\\)', '', regex=True)

    # Add a synthetic "all diseases" row at the beginning
    splits_df['node_index'] = splits_df['node_index'].astype(str)
    splits_df = pd.concat([pd.DataFrame([['all', 'all diseases', None, disease_splits.shape[0]]], columns=splits_df.columns), splits_df], ignore_index=True)

    # For each disease split, count the number of edges (rows in the split's
    # CSV file) via `wc -l` so the file is never read into memory
    edge_counts = []
    for index, row in splits_df.iterrows():
        # Count lines; subtract one for the header row
        file_name = project_config.DATA_DIR / 'disease_splits' / 'split_edges' / f'{row["node_index"]}.csv'
        edge_count = int(subprocess.check_output(['wc', '-l', file_name]).split()[0]) - 1
        edge_counts.append(edge_count)

    # Add edge counts to splits_df
    splits_df['edge_count'] = edge_counts

    # Get list of available models (fix comment typo: "modles")
    model_files = os.listdir(project_config.MODEL_DIR / 'embeddings')
    model_files = [f for f in model_files if f.endswith('_embeddings.pt')]

    def get_model_metadata(f):
        """Parse a model-embedding filename into its timestamp and parameters.

        Expected layout: YYYY_MM_DD_HH_MM_SS_<key=val>-<key=val>..._embeddings.pt
        Returns a dict of the key=val parameters plus 'date' and 'file'.
        """
        # First six underscore-separated fields form the timestamp
        metadata = f.split('_')
        date = '_'.join(metadata[:6])
        date = pd.to_datetime(date, format='%Y_%m_%d_%H_%M_%S')

        # Parameters are dash-separated key=value pairs
        params = metadata[6].split('-')
        params = {p.split('=')[0]: p.split('=')[1] for p in params}

        # Add the timestamp and original filename to the parameters
        params['date'] = date
        params['file'] = f
        return params

    # Get available models, only keeping the latest version per split
    avail_models = pd.DataFrame([get_model_metadata(f) for f in model_files])
    avail_models = avail_models.sort_values('date', ascending=False).drop_duplicates('test').reset_index(drop=True)

    # Mark splits for which a trained model exists
    splits_df['available'] = splits_df['node_index'].isin(avail_models['test'])

    # If the all-diseases model is available, mark the 'all diseases' row too
    if avail_models['test'].str.contains('all').any():
        splits_df.loc[splits_df['node_name'] == 'all diseases', 'available'] = True
103
+
104
####################################################################################################

# Select disease split from splits with available models
# Make dictionary with node_index: node_name, where name is value shown but index is used for query
# split_options = splits_df[splits_df['available']].copy()
split_options = splits_df.copy()
split_options = split_options.set_index('node_index')['node_name'].to_dict()

# Check if a split is already in session state; reuse it as the default
if "split" not in st.session_state:
    split_index = 0
else:
    split_index = list(split_options.keys()).index(st.session_state.split)

# Selectbox stores node_index values but displays node names
split = st.selectbox("Disease Split", list(split_options.keys()), format_func = lambda x: split_options[x],
                     index = split_index)

# Show the full splits dataframe with human-readable column names
splits_display = splits_df[['node_index', 'node_name', 'node_count', 'edge_count', 'available']].copy()
splits_display = splits_display.rename(columns = {'node_index': 'Split ID', 'node_name': 'Disease', 'node_count': 'Node Count', 'edge_count': 'Edge Count', 'available': 'Model Available'})
st.dataframe(splits_display, use_container_width = True, hide_index = True)

# Save split and available models to session state for the other pages
st.session_state.split = split
st.session_state.splits_df = splits_df
st.session_state.avail_models = avail_models

if st.button("Explore Split"):
    st.switch_page("pages/split.py")
133
+
134
####################################################################################################

st.subheader("Construct Query", divider = "red")

# # Checkbox to allow reverse edges
# allow_reverse_edges = st.checkbox("Allow reverse edges?", value = False)
# Reverse edges are currently always disabled
allow_reverse_edges = False

# Load knowledge graph nodes (cached helper)
kg_nodes = load_kg()

with st.spinner('Loading knowledge graph...'):
    # kg_nodes = nodes = pd.read_csv(project_config.DATA_DIR / 'kg_nodes.csv', dtype = {'node_index': int}, low_memory = False)
    node_types = pd.read_csv(project_config.DATA_DIR / 'kg_node_types.csv')
    edge_types = pd.read_csv(project_config.DATA_DIR / 'kg_edge_types.csv')

# Keep only forward edges unless reverse edges are allowed
if not allow_reverse_edges:
    edge_types = edge_types[edge_types.direction == 'forward']

# If query is not in session state, initialize the selectbox defaults
if "query" not in st.session_state:
    source_node_type_index = 0
    source_node_index = 0
    target_node_type_index = 0
    relation_index = 0

    # Clalit users get preset defaults
    # NOTE(review): assumes st.session_state.team is always set at login — verify
    if st.session_state.team == "Clalit":
        source_node_type_index = 2
        source_node_index = 0
        target_node_type_index = 3
        relation_index = 2

else:
    # Restore the previous selections from the saved query options
    source_node_type_index = st.session_state.query_options['source_node_type'].index(st.session_state.query['source_node_type'])
    source_node_index = st.session_state.query_options['source_node'].index(st.session_state.query['source_node'])
    target_node_type_index = st.session_state.query_options['target_node_type'].index(st.session_state.query['target_node_type'])
    relation_index = st.session_state.query_options['relation'].index(st.session_state.query['relation'])

# Select source node type
source_node_type_options = node_types['node_type']
source_node_type = st.selectbox("Source Node Type", source_node_type_options,
                                format_func = lambda x: x.replace("_", " "), index = source_node_type_index)

# If source node type is disease, add option to select only diseases in current split
if source_node_type == 'disease':

    # Get diseases in current split
    if split == 'all':
        split_diseases = disease_splits.drop_duplicates(subset='node_name')['node_name']
    else:
        split_diseases = disease_splits[disease_splits['disease_split_index'] == split]
        split_diseases = split_diseases.drop_duplicates(subset='node_name')['node_name']

    # Add checkbox to filter diseases
    filter_diseases = st.checkbox("Filter diseases to current split?", value = False)

# Select source node (optionally restricted to the current split's diseases)
if source_node_type == 'disease' and filter_diseases:
    # source_node_options = source_node_options[source_node_options.isin(split_diseases)]
    source_node_options = split_diseases
else:
    source_node_options = kg_nodes[kg_nodes['node_type'] == source_node_type]['node_name']
source_node = st.selectbox("Source Node", source_node_options,
                           index = source_node_index)

# Select target node type (restricted to types reachable from the source type)
target_node_type_options = edge_types[edge_types.x_type == source_node_type].y_type.unique()
target_node_type = st.selectbox("Target Node Type", target_node_type_options,
                                format_func = lambda x: x.replace("_", " "), index = target_node_type_index)

# Select relation (restricted to edges between the chosen node types)
relation_options = edge_types[(edge_types.x_type == source_node_type) & (edge_types.y_type == target_node_type)].relation.unique()
relation = st.selectbox("Edge Type", relation_options,
                        format_func = lambda x: x.replace("_", "-"), index = relation_index)

# Button to submit query
if st.button("Submit Query"):

    # Check if a trained model is available for the chosen split
    model_avail = splits_df.loc[splits_df['node_index'] == st.session_state.split, 'available'].values[0]
    if not model_avail:

        st.error("A trained model is not yet available for this disease split. Please select another disease split for which a trained model is available.", icon="🚨")

    else:

        # Save query to session state
        st.session_state.query = {
            "source_node_type": source_node_type,
            "source_node": source_node,
            "target_node_type": target_node_type,
            "relation": relation
        }

        # Save query options to session state (used to restore selections)
        st.session_state.query_options = {
            "source_node_type": list(source_node_type_options),
            "source_node": list(source_node_options),
            "target_node_type": list(target_node_type_options),
            "relation": list(relation_options)
        }

        # Delete any stale validation results from session state
        if "validation" in st.session_state:
            del st.session_state.validation

        # # Write query to console
        # st.write("Current Query:")
        # st.write(st.session_state.query)
        st.write("Query submitted.")

        # Switch to the Predict page
        st.switch_page("pages/predict.py")


st.subheader("Knowledge Graph", divider = "red")
display_data = kg_nodes[['node_id', 'node_type', 'node_name', 'node_source']].copy()
display_data = display_data.rename(columns = {'node_id': 'ID', 'node_type': 'Type', 'node_name': 'Name', 'node_source': 'Database'})
st.dataframe(display_data, use_container_width = True, hide_index = True)
pages/predict.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from menu import menu_with_redirect
3
+
4
+ # Standard imports
5
+ import numpy as np
6
+ import pandas as pd
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+
11
+ # Path manipulation
12
+ from pathlib import Path
13
+ from huggingface_hub import hf_hub_download
14
+
15
+ # Plotting
16
+ import matplotlib.pyplot as plt
17
+ plt.rcParams['font.sans-serif'] = 'Arial'
18
+
19
+ # Custom and other imports
20
+ import project_config
21
+ from utils import capitalize_after_slash, load_kg
22
+
23
# Redirect to app.py if not logged in, otherwise show the navigation menu
menu_with_redirect()

# Page header image
st.image(str(project_config.MEDIA_DIR / 'predict_header.svg'), use_column_width=True)

# Main content
# st.markdown(f"Hello, {st.session_state.name}!")

# Section title, e.g. "Drug Search" when the target node type is 'drug'
st.subheader(f"{capitalize_after_slash(st.session_state.query['target_node_type'])} Search", divider = "blue")

# Print the current query saved by the Input page
st.markdown(f"**Query:** {st.session_state.query['source_node'].replace('_', ' ')} ➡️ {st.session_state.query['relation'].replace('_', '-')} ➡️ {st.session_state.query['target_node_type'].replace('_', ' ')}")
36
+
37
@st.cache_data(show_spinner = 'Downloading AI model...')
def get_embeddings():
    """Resolve the file paths of the model artifacts for prediction.

    Returns:
        tuple: (embed_path, relation_weights_path, edge_types_path),
        all under MODEL_DIR / 'embeddings'.

    NOTE(review): st.cache_data on a zero-argument function that reads
    st.session_state means the cached result will not refresh when the
    selected split changes — confirm this is intended.
    """

    # # Get checkpoint name
    # best_ckpt = "2024_05_15_13_05_33_epoch=2-step=40383"

    # # Get paths to embeddings, relation weights, and edge types
    # # with st.spinner('Downloading AI model...'):
    # embed_path = hf_hub_download(repo_id="ayushnoori/galaxy",
    #                              filename=(best_ckpt + "-thresh=4000_embeddings.pt"),
    #                              token=st.secrets["HF_TOKEN"])
    # relation_weights_path = hf_hub_download(repo_id="ayushnoori/galaxy",
    #                                         filename=(best_ckpt + "_relation_weights.pt"),
    #                                         token=st.secrets["HF_TOKEN"])
    # edge_types_path = hf_hub_download(repo_id="ayushnoori/galaxy",
    #                                   filename=(best_ckpt + "_edge_types.pt"),
    #                                   token=st.secrets["HF_TOKEN"])

    # Get split name
    # NOTE(review): `split` is read but never used below — the model lookup
    # is hard-coded to the 'all' (all-diseases) model. Confirm whether the
    # selected split's model should be used instead.
    split = st.session_state.split
    avail_models = st.session_state.avail_models

    # Get model name from avail_models (latest 'all' model)
    embed_name = avail_models[avail_models['test'] == 'all']['file'].values[0]
    relation_weights_name = embed_name.replace('_embeddings.pt', '_relation_weights.pt')
    edge_types_name = embed_name.replace('_embeddings.pt', '_edge_types.pt')

    # Convert filenames to full paths
    embed_path = project_config.MODEL_DIR / 'embeddings' / embed_name
    relation_weights_path = project_config.MODEL_DIR / 'embeddings' / relation_weights_name
    edge_types_path = project_config.MODEL_DIR / 'embeddings' / edge_types_name

    return embed_path, relation_weights_path, edge_types_path
70
+
71
@st.cache_data(show_spinner = 'Loading AI model...')
def load_embeddings(embed_path, relation_weights_path, edge_types_path):
    """Deserialize the node embeddings, relation weights, and edge types.

    Each argument is a path to a torch-serialized artifact; the three
    loaded objects are returned as a tuple in the same order.
    """
    return (
        torch.load(embed_path),
        torch.load(relation_weights_path),
        torch.load(edge_types_path),
    )
81
+
82
# Load knowledge graph and embeddings (paths resolved first, then loaded;
# both steps are cached via st.cache_data)
kg_nodes = load_kg()
embed_path, relation_weights_path, edge_types_path = get_embeddings()
embeddings, relation_weights, edge_types = load_embeddings(embed_path, relation_weights_path, edge_types_path)
86
+
87
+ # # Print source node type
88
+ # st.write(f"Source Node Type: {st.session_state.query['source_node_type']}")
89
+
90
+ # # Print source node
91
+ # st.write(f"Source Node: {st.session_state.query['source_node']}")
92
+
93
+ # # Print relation
94
+ # st.write(f"Edge Type: {st.session_state.query['relation']}")
95
+
96
+ # # Print target node type
97
+ # st.write(f"Target Node Type: {st.session_state.query['target_node_type']}")
98
+
99
+ # Compute predictions
100
+ with st.spinner('Computing predictions...'):
101
+
102
+ source_node_type = st.session_state.query['source_node_type']
103
+ source_node = st.session_state.query['source_node']
104
+ relation = st.session_state.query['relation']
105
+ target_node_type = st.session_state.query['target_node_type']
106
+
107
+ # Get source node index
108
+ src_index = kg_nodes[(kg_nodes.node_type == source_node_type) & (kg_nodes.node_name == source_node)].node_index.values[0]
109
+
110
+ # Get relation index
111
+ edge_type_index = [i for i, etype in enumerate(edge_types) if etype == (source_node_type, relation, target_node_type)][0]
112
+
113
+ # Get target nodes indices
114
+ target_nodes = kg_nodes[kg_nodes.node_type == target_node_type].copy()
115
+ dst_indices = target_nodes.node_index.values
116
+ src_indices = np.repeat(src_index, len(dst_indices))
117
+
118
+ # Retrieve cached embeddings and apply activation function
119
+ src_embeddings = embeddings[src_indices]
120
+ dst_embeddings = embeddings[dst_indices]
121
+ src_embeddings = F.leaky_relu(src_embeddings)
122
+ dst_embeddings = F.leaky_relu(dst_embeddings)
123
+
124
+ # Get relation weights
125
+ rel_weights = relation_weights[edge_type_index]
126
+
127
+ # Compute weighted dot product
128
+ scores = torch.sum(src_embeddings * rel_weights * dst_embeddings, dim = 1)
129
+ scores = torch.sigmoid(scores)
130
+
131
+ # Add scores to dataframe
132
+ target_nodes['score'] = scores.detach().numpy()
133
+ target_nodes = target_nodes.sort_values(by = 'score', ascending = False)
134
+ target_nodes['rank'] = np.arange(1, target_nodes.shape[0] + 1)
135
+
136
+ # Rename columns
137
+ display_data = target_nodes[['rank', 'node_id', 'node_name', 'score', 'node_source']].copy()
138
+ display_data = display_data.rename(columns = {'rank': 'Rank', 'node_id': 'ID', 'node_name': 'Name', 'score': 'Score', 'node_source': 'Database'})
139
+
140
+ # Define dictionary mapping node types to database URLs
141
+ map_dbs = {
142
+ 'gene/protein': lambda x: f"https://ncbi.nlm.nih.gov/gene/?term={x}",
143
+ 'drug': lambda x: f"https://go.drugbank.com/drugs/{x}",
144
+ 'effect/phenotype': lambda x: f"https://hpo.jax.org/app/browse/term/HP:{x.zfill(7)}", # pad with 0s to 7 digits
145
+ 'disease': lambda x: x, # MONDO
146
+ # pad with 0s to 7 digits
147
+ 'biological_process': lambda x: f"https://amigo.geneontology.org/amigo/term/GO:{x.zfill(7)}",
148
+ 'molecular_function': lambda x: f"https://amigo.geneontology.org/amigo/term/GO:{x.zfill(7)}",
149
+ 'cellular_component': lambda x: f"https://amigo.geneontology.org/amigo/term/GO:{x.zfill(7)}",
150
+ 'exposure': lambda x: f"https://ctdbase.org/detail.go?type=chem&acc={x}",
151
+ 'pathway': lambda x: f"https://reactome.org/content/detail/{x}",
152
+ 'anatomy': lambda x: x,
153
+ }
154
+
155
+ # Get name of database
156
+ display_database = display_data['Database'].values[0]
157
+
158
+ # Add URLs to database column
159
+ display_data['Database'] = display_data.apply(lambda x: map_dbs[target_node_type](x['ID']), axis = 1)
160
+
161
+ # Check if validation data exists
162
+ if 'validation' in st.session_state:
163
+
164
+ # Checkbox to allow reverse edges
165
+ show_val = st.checkbox("Show ground truth validation?", value = False)
166
+
167
+ if show_val:
168
+
169
+ # Get validation data
170
+ val_results = st.session_state.validation.copy()
171
+
172
+ # Merge with predictions
173
+ val_display_data = pd.merge(display_data, val_results, left_on = 'ID', right_on = 'y_id', how='left')
174
+ val_display_data = val_display_data.fillna(0).drop(columns='y_id')
175
+
176
+ # Get new columns
177
+ val_relations = val_display_data.columns.difference(display_data.columns).tolist()
178
+
179
+ # Replace 0 with blank and 1 with check emoji in new columns
180
+ for col in val_relations:
181
+ val_display_data[col] = val_display_data[col].replace({0: '', 1: '✅'})
182
+
183
+ # Define a function to apply styles
184
+ def style_val(val):
185
+ if val == '✅':
186
+ return 'background-color: #C2EABD;' # text-align: center;
187
+ return 'background-color: #F5F5F5;' # text-align: center;
188
+
189
+ else:
190
+ show_val = False
191
+
192
+
193
+ # NODE SEARCH
194
+
195
+ # Use multiselect to search for specific nodes
196
+ selected_nodes = st.multiselect(f"Search for specific {target_node_type.replace('_', ' ')} nodes to determine their ranking.",
197
+ display_data.Name, placeholder = "Type to search...")
198
+
199
+ # Filter nodes
200
+ if len(selected_nodes) > 0:
201
+
202
+ if show_val:
203
+ # selected_display_data = val_display_data[val_display_data.Name.isin(selected_nodes)]
204
+ selected_display_data = val_display_data[val_display_data.Name.isin(selected_nodes)].copy()
205
+ selected_display_data = selected_display_data.reset_index(drop=True).style.map(style_val, subset=val_relations)
206
+ else:
207
+ selected_display_data = display_data[display_data.Name.isin(selected_nodes)].copy()
208
+ selected_display_data = selected_display_data.reset_index(drop=True)
209
+
210
+ st.markdown(f"Out of {target_nodes.shape[0]} {target_node_type} nodes, the selected nodes rank as follows:")
211
+ selected_display_data_with_rank = selected_display_data.copy()
212
+ selected_display_data_with_rank['Rank'] = selected_display_data_with_rank['Rank'].apply(lambda x: f"{x} (top {(100*x/target_nodes.shape[0]):.2f}% of predictions)")
213
+
214
+ # Show filtered nodes
215
+ if target_node_type not in ['disease', 'anatomy']:
216
+ st.dataframe(selected_display_data_with_rank, use_container_width = True, hide_index = True,
217
+ column_config={"Database": st.column_config.LinkColumn(width = "small",
218
+ help = "Click to visit external database.",
219
+ display_text = display_database)})
220
+ else:
221
+ st.dataframe(selected_display_data_with_rank, use_container_width = True)
222
+
223
+ # Show plot
224
+ st.markdown(f"In the plot below, the dashed lines represent the rank of the selected {target_node_type} nodes across all predictions for {source_node}.")
225
+
226
+ # Checkbox to show text labels
227
+ show_labels = st.checkbox("Show Text Labels?", value = False)
228
+
229
+ # Plot rank vs. score using matplotlib
230
+ fig, ax = plt.subplots(figsize = (10, 6))
231
+ ax.plot(display_data['Rank'], display_data['Score'], color = 'black', linewidth = 1.5, zorder = 2)
232
+ ax.set_xlabel('Rank', fontsize = 12)
233
+ ax.set_ylabel('Score', fontsize = 12)
234
+ ax.set_xlim(1, display_data['Rank'].max())
235
+
236
+ # Get color palette
237
+ # palette = plt.cm.get_cmap('tab10', len(selected_display_data))
238
+ palette = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
239
+
240
+ # Add vertical line for selected nodes
241
+ for i, node in selected_display_data.iterrows():
242
+ ax.scatter(node['Rank'], node['Score'], color = palette[i], zorder=3)
243
+ ax.axvline(node['Rank'], color = palette[i], linestyle = '--', linewidth = 1.5, label = node['Name'], zorder=3)
244
+ if show_labels:
245
+ ax.text(node['Rank'] + 100, node['Score'], node['Name'], fontsize = 10, color = palette[i], zorder=3)
246
+
247
+ # Add legend
248
+ ax.legend(loc = 'upper right', fontsize = 10)
249
+ ax.grid(alpha = 0.2, zorder=0)
250
+
251
+ st.pyplot(fig)
252
+
253
+
254
+ # FULL RESULTS
255
+
256
+ # Show top ranked nodes
257
+ st.subheader("Model Predictions", divider = "blue")
258
+ top_k = st.slider('Select number of top ranked nodes to show.', 1, target_nodes.shape[0], min(500, target_nodes.shape[0]))
259
+
260
+ # Show full results
261
+ # full_results = val_display_data.iloc[:top_k] if show_val else display_data.iloc[:top_k]
262
+ full_results = val_display_data.iloc[:top_k].style.map(style_val, subset=val_relations) if show_val else display_data.iloc[:top_k]
263
+
264
+ if target_node_type not in ['disease', 'anatomy']:
265
+ st.dataframe(full_results, use_container_width = True, hide_index = True,
266
+ column_config={"Database": st.column_config.LinkColumn(width = "small",
267
+ help = "Click to visit external database.",
268
+ display_text = display_database)})
269
+ else:
270
+ st.dataframe(full_results, use_container_width = True, hide_index = True,)
271
+
272
+ # Save to session state
273
+ st.session_state.predictions = display_data
274
+ st.session_state.display_database = display_database
pages/split.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from menu import menu_with_redirect
3
+
4
+ # Standard imports
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ # Path manipulation
9
+ import os
10
+ from pathlib import Path
11
+
12
+ # Plotting
13
+ import matplotlib.pyplot as plt
14
+ plt.rcParams['font.sans-serif'] = 'Arial'
15
+
16
+ # Custom and other imports
17
+ import project_config
18
+
19
+ # Redirect to app.py if not logged in, otherwise show the navigation menu
20
+ menu_with_redirect()
21
+
22
+ # Back button with emoji
23
+ if st.button("◀️ Back"):
24
+ st.switch_page("pages/input.py")
25
+
26
+ # Get metadata from session state
27
+ split = st.session_state.split
28
+ splits_df = st.session_state.splits_df
29
+
30
+ with st.spinner('Loading disease splits...'):
31
+
32
+ # Read disease splits
33
+ disease_split_nodes = pd.read_csv(project_config.DATA_DIR / 'disease_splits' / 'disease_splits.csv', dtype = {'disease_split_index': str})
34
+
35
+ # If split is all
36
+ if split == 'all':
37
+ disease_split_nodes = disease_split_nodes[['node_index', 'node_name', 'embedding_score', 'levenshtein_score', 'neighborhood_score', 'method', 'disease_split']]
38
+ disease_split_nodes = disease_split_nodes.rename(columns = {'node_index': 'Node ID', 'node_name': 'Disease', 'embedding_score': 'Embedding Score', 'levenshtein_score': 'Levenshtein Score', 'neighborhood_score': 'Neighborhood Score', 'method': 'Method', 'disease_split': 'Disease Split'})
39
+
40
+ else:
41
+ disease_split_nodes = disease_split_nodes[disease_split_nodes['disease_split_index'] == split]
42
+ disease_split_nodes = disease_split_nodes[['node_index', 'node_name', 'embedding_score', 'levenshtein_score', 'neighborhood_score', 'method']]
43
+ disease_split_nodes = disease_split_nodes.rename(columns = {'node_index': 'Node ID', 'node_name': 'Disease', 'embedding_score': 'Embedding Score', 'levenshtein_score': 'Levenshtein Score', 'neighborhood_score': 'Neighborhood Score', 'method': 'Method'})
44
+
45
+ # Read disease split edges
46
+ disease_split_edges = pd.read_csv(project_config.DATA_DIR / 'disease_splits' / 'split_edges' / f'{split}.csv')
47
+
48
+ # Subset and rename columns
49
+ disease_split_edges = disease_split_edges[['relation', 'x_index', 'x_type', 'x_name', 'y_index', 'y_type', 'y_name']]
50
+ disease_split_edges['relation'] = disease_split_edges['relation'].str.replace('_', ' ').str.title()
51
+ disease_split_edges = disease_split_edges.rename(columns = {'relation': 'Relation', 'x_index': 'Source ID', 'x_type': 'Source Type', 'x_name': 'Source Name', 'y_index': 'Target ID', 'y_type': 'Target Type', 'y_name': 'Target Name'})
52
+
53
+ st.subheader("Nodes in Disease Split", divider = "blue")
54
+ st.markdown(f"**Disease Split:** {splits_df[splits_df['node_index'] == split]['node_name'].values[0]}")
55
+ st.markdown(f"**Number of Nodes:** {disease_split_nodes.shape[0]}")
56
+
57
+ # Show as dataframe
58
+ st.dataframe(disease_split_nodes, use_container_width = True, hide_index = True)
59
+
60
+ st.markdown("Below, we show the number of nodes by method of inclusion in the disease split.")
61
+
62
+ # Plotting the bar plot
63
+ method_counts = disease_split_nodes['Method'].value_counts().reset_index()
64
+ method_counts.columns = ['Method', 'Count']
65
+ method_counts['Method Length'] = method_counts['Method'].apply(len)
66
+ method_counts = method_counts.sort_values('Method Length')
67
+
68
+ # Plotting the bar plot
69
+ plt.figure(figsize=(10, 6))
70
+ bars = plt.bar(method_counts['Method'], method_counts['Count'], color='#B8D4F7', edgecolor='black')
71
+ plt.xlabel('Method', fontsize=16, fontweight='bold')
72
+ plt.ylabel('Count', fontsize=16, fontweight='bold')
73
+ plt.xticks(rotation=45, ha='right', fontsize=12)
74
+ plt.tight_layout()
75
+
76
+ # Adding labels on top of each bar
77
+ for bar in bars:
78
+ yval = bar.get_height()
79
+ plt.text(bar.get_x() + bar.get_width()/2.0, yval, int(yval), va='bottom', fontsize=12)
80
+ plt.ylim(0, max(method_counts['Count'])*1.1)
81
+
82
+ # Show plot
83
+ st.pyplot(plt)
84
+
85
+
86
+ st.subheader("Edges in Disease Split", divider = "green")
87
+ st.markdown(f"**Number of Edges:** {disease_split_edges.shape[0]}")
88
+
89
+ # Show as dataframe
90
+ st.dataframe(disease_split_edges, use_container_width = True, hide_index = True)
91
+
92
+ if disease_split_edges.shape[0] > 0:
93
+
94
+ # Make bar plot of number of edges by relation
95
+ st.markdown("Below, we show the number of edges by relation in the disease split.")
96
+ relation_counts = disease_split_edges['Relation'].value_counts().reset_index()
97
+ relation_counts.columns = ['Relation', 'Count']
98
+
99
+ # Plotting the bar plot
100
+ plt.figure(figsize=(10, 6))
101
+ bars = plt.bar(relation_counts['Relation'], relation_counts['Count'], color='#C1ECC5', edgecolor='black')
102
+ plt.xlabel('Relation', fontsize=16, fontweight='bold')
103
+ plt.ylabel('Count', fontsize=16, fontweight='bold')
104
+ plt.xticks(rotation=45, ha='right', fontsize=12)
105
+ plt.tight_layout()
106
+
107
+ # Adding labels on top of each bar
108
+ for bar in bars:
109
+ yval = bar.get_height()
110
+ plt.text(bar.get_x() + bar.get_width()/2.0, yval, int(yval), va='bottom', fontsize=12)
111
+ plt.ylim(0, max(relation_counts['Count'])*1.1)
112
+
113
+ # Show plot
114
+ st.pyplot(plt)
pages/validate.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from menu import menu_with_redirect
3
+
4
+ # Standard imports
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ # Path manipulation
9
+ from pathlib import Path
10
+
11
+ # Plotting
12
+ import matplotlib.pyplot as plt
13
+ plt.rcParams['font.sans-serif'] = 'Arial'
14
+ import matplotlib.colors as mcolors
15
+
16
+ # Custom and other imports
17
+ import project_config
18
+ from utils import load_kg, load_kg_edges
19
+
20
+ # Redirect to app.py if not logged in, otherwise show the navigation menu
21
+ menu_with_redirect()
22
+
23
+ # Header
24
+ st.image(str(project_config.MEDIA_DIR / 'validate_header.svg'), use_column_width=True)
25
+
26
+ # Main content
27
+ # st.markdown(f"Hello, {st.session_state.name}!")
28
+
29
+ st.subheader("Validate Predictions", divider = "green")
30
+
31
+ # Print current query
32
+ st.markdown(f"**Query:** {st.session_state.query['source_node'].replace('_', ' ')} ➡️ {st.session_state.query['relation'].replace('_', '-')} ➡️ {st.session_state.query['target_node_type'].replace('_', ' ')}")
33
+
34
+ # Coming soon
35
+ # st.write("Coming soon...")
36
+
37
+ source_node_type = st.session_state.query['source_node_type']
38
+ source_node = st.session_state.query['source_node']
39
+ relation = st.session_state.query['relation']
40
+ target_node_type = st.session_state.query['target_node_type']
41
+ predictions = st.session_state.predictions
42
+
43
+ kg_nodes = load_kg()
44
+ kg_edges = load_kg_edges()
45
+
46
+ # Convert tuple to hex
47
+ def rgba_to_hex(rgba):
48
+ return mcolors.to_hex(rgba[:3])
49
+
50
+ with st.spinner('Searching known relationships...'):
51
+
52
+ # Subset existing edges
53
+ edge_subset = kg_edges[(kg_edges.x_type == source_node_type) & (kg_edges.x_name == source_node)]
54
+ edge_subset = edge_subset[edge_subset.y_type == target_node_type]
55
+
56
+ # Merge edge subset with predictions
57
+ edges_in_kg = pd.merge(predictions, edge_subset[['relation', 'y_id']], left_on = 'ID', right_on = 'y_id', how = 'right')
58
+ edges_in_kg = edges_in_kg.sort_values(by = 'Score', ascending = False)
59
+ edges_in_kg = edges_in_kg.drop(columns = 'y_id')
60
+
61
+ # Rename relation to ground-truth
62
+ edges_in_kg = edges_in_kg[['relation'] + [col for col in edges_in_kg.columns if col != 'relation']]
63
+ edges_in_kg = edges_in_kg.rename(columns = {'relation': 'Known Relation'})
64
+
65
+ # If there exist edges in KG
66
+ if len(edges_in_kg) > 0:
67
+
68
+ with st.spinner('Saving validation results...'):
69
+
70
+ # Cast long to wide
71
+ val_results = edge_subset[['relation', 'y_id']].pivot_table(index='y_id', columns='relation', aggfunc='size', fill_value=0)
72
+ val_results = (val_results > 0).astype(int).reset_index()
73
+ val_results.columns = [val_results.columns[0]] + [x.replace('_', ' ').title() for x in val_results.columns[1:]]
74
+
75
+ # Save validation results to session state
76
+ st.session_state.validation = val_results
77
+
78
+ with st.spinner('Plotting known relationships...'):
79
+
80
+ # Define a color map for different relations
81
+ color_map = plt.get_cmap('tab10')
82
+
83
+ # Group by relation and create separate plots
84
+ relations = edges_in_kg['Known Relation'].unique()
85
+ for idx, relation in enumerate(relations):
86
+
87
+ relation_data = edges_in_kg[edges_in_kg['Known Relation'] == relation]
88
+
89
+ # Get a color from the color map
90
+ color = color_map(idx % color_map.N)
91
+
92
+ fig, ax = plt.subplots(figsize=(10, 3))
93
+ ax.plot(predictions['Rank'], predictions['Score'])
94
+ ax.set_xlabel('Rank', fontsize=12)
95
+ ax.set_ylabel('Score', fontsize=12)
96
+ ax.set_xlim(1, predictions['Rank'].max())
97
+
98
+ for i, node in relation_data.iterrows():
99
+ ax.axvline(node['Rank'], color=color, linestyle='--', label=node['Name'])
100
+ # ax.text(node['Rank'] + 100, node['Score'], node['Name'], fontsize=10, color=color)
101
+
102
+ # ax.set_title(f'{relation.replace("_", "-")}')
103
+ # ax.legend()
104
+ color_hex = rgba_to_hex(color)
105
+
106
+ # Write header in color of relation
107
+ st.markdown(f"<h3 style='color:{color_hex}'>{relation.replace('_', ' ').title()}</h2>", unsafe_allow_html=True)
108
+
109
+ # Show plot
110
+ st.pyplot(fig)
111
+
112
+ # Drop known relation column
113
+ relation_data = relation_data.drop(columns = 'Known Relation')
114
+ if target_node_type not in ['disease', 'anatomy']:
115
+ st.dataframe(relation_data, use_container_width=True,
116
+ column_config={"Database": st.column_config.LinkColumn(width = "small",
117
+ help = "Click to visit external database.",
118
+ display_text = st.session_state.display_database)})
119
+ else:
120
+ st.dataframe(relation_data, use_container_width=True)
121
+
122
+ else:
123
+
124
+ st.error('No ground truth relationships found for the given query in the knowledge graph.', icon="✖️")
project_config.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ PROJECT CONFIGURATION FILE
3
+ This file contains the configuration variables for the project. The variables are used
4
+ in the other scripts to define the paths to the data and results directories. The variables
5
+ are also used to set the random seed for reproducibility.
6
+ '''
7
+
8
+ # Import libraries
9
+ from pathlib import Path
10
+ import socket
11
+ import getpass
12
+
13
+ def check_internet_connection():
14
+ try:
15
+ # Connect to one of the DNS servers
16
+ socket.create_connection(("8.8.8.8", 53), timeout=5)
17
+ return True
18
+ except OSError:
19
+ return False
20
+
21
+ def check_local_machine():
22
+ hostname = socket.gethostname()
23
+ username = getpass.getuser()
24
+
25
+ return hostname, username
26
+
27
+ # Define global variable indicating whether on VDI or not
28
+ VDI = not check_internet_connection()
29
+ print(f"VDI: {VDI}")
30
+
31
+ # Define global variable to check if running locally
32
+ hostname, username = check_local_machine()
33
+ LOCAL = True if username == 'an583' else False
34
+
35
+ # Define HF repo variable
36
+ HF_REPO = 'ayushnoori/clinical-drug-repurposing'
37
+
38
+ # Define project configuration variables
39
+ PROJECT_DIR = Path(__file__).resolve().parent
40
+ DATA_DIR = PROJECT_DIR / 'data'
41
+ AUTH_DIR = PROJECT_DIR / 'auth'
42
+ MODEL_DIR = PROJECT_DIR / 'models'
43
+ MEDIA_DIR = PROJECT_DIR / 'media'
44
+ SEED = 42
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy
2
+ pandas
3
+ scikit-learn
4
+ matplotlib
5
+ seaborn
6
+ pathlib
7
+ torch
8
+ altair<5
9
+ gspread
10
+ oauth2client
11
+ huggingface_hub
sync_data.sh ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Set permissions with chmod +x sync_data.sh
4
+ # Run with ./sync_data.sh
5
+
6
+ # Ask the user for the environment
7
+ echo "Would you like to sync results from O2 or Kempner?"
8
+ echo "1) O2"
9
+ echo "2) Kempner"
10
+ read -p "Enter your choice (1 or 2): " env_choice
11
+
12
+ # Ask the user which results folder to sync
13
+ echo "Which results folder do you want to sync?"
14
+ echo "1) data: disease splits"
15
+ echo "2) disease split models: checkpoints"
16
+ echo "3) disease split models: embeddings"
17
+ read -p "Enter your choice (1, 2, or 3): " folder_choice
18
+
19
+ # Map user input to folder names
20
+ case $folder_choice in
21
+ 1) SRC_FOLDER="Data/DrugKG/2_harmonize_KG/disease_splits";;
22
+ 2) SRC_FOLDER="Results/GALAXY/disease_splits/checkpoints";;
23
+ 3) SRC_FOLDER="Results/GALAXY/disease_splits/embeddings";;
24
+ *) echo "Invalid folder choice. Please enter 1, 2, or 3."; exit 1;;
25
+ esac
26
+
27
+ case $env_choice in
28
+ 1) SRC_DIR="[email protected]:/n/data1/hms/dbmi/zitnik/lab/users/an252/NeuroKG/neuroKG/$SRC_FOLDER";;
29
+ 2) SRC_DIR="[email protected]:/n/holylabs/LABS/mzitnik_lab/Users/anoori/neuroKG/$SRC_FOLDER";;
30
+ *) echo "Invalid source server choice. Please enter 1 or 2."; exit 1;;
31
+ esac
32
+
33
+ # Map user input to destination folder names
34
+ case $folder_choice in
35
+ 1) DST_DIR="data/disease_splits";;
36
+ 2) DST_DIR="models/checkpoints";; # Don't need checkpoints for this application
37
+ 3) DST_DIR="models/embeddings";;
38
+ *) echo "Invalid folder choice. Please enter 1, 2, or 3."; exit 1;;
39
+ esac
40
+
41
+ # Sync source and destination folders with specific file types for checkpoints or embeddings
42
+ # Note, local files not present in the source will be deleted
43
+ if [[ $folder_choice -eq 2 || $folder_choice -eq 3 ]]; then
44
+ echo "Syncing only .ckpt or .pt files from $SRC_DIR to $DST_DIR..."
45
+ rsync -avz -e ssh --include="*.ckpt" --include="*.pt" --exclude="*" --delete $SRC_DIR/ $DST_DIR
46
+ else
47
+ echo "Syncing $SRC_DIR to $DST_DIR..."
48
+ rsync -avz -e ssh --delete $SRC_DIR/ $DST_DIR
49
+ fi
50
+
51
+ echo "Synchronization complete."
utils.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import project_config
4
+ import base64
5
+
6
+
7
+ @st.cache_data(show_spinner = 'Loading knowledge graph nodes...')
8
+ def load_kg():
9
+ # with st.spinner('Loading knowledge graph...'):
10
+ kg_nodes = pd.read_csv(project_config.DATA_DIR / 'kg_nodes.csv', dtype = {'node_index': int}, low_memory = False)
11
+ return kg_nodes
12
+
13
+
14
+ @st.cache_data(show_spinner = 'Loading knowledge graph edges...')
15
+ def load_kg_edges():
16
+ # with st.spinner('Loading knowledge graph...'):
17
+ kg_edges = pd.read_csv(project_config.DATA_DIR / 'kg_edges.csv', dtype = {'edge_index': int, 'x_index': int, 'y_index': int}, low_memory = False)
18
+ return kg_edges
19
+
20
+
21
+ def capitalize_after_slash(s):
22
+ # Split the string by slashes first
23
+ parts = s.split('/')
24
+ # Capitalize each part separately
25
+ capitalized_parts = [part.title() for part in parts]
26
+ # Rejoin the parts with slashes
27
+ capitalized_string = '/'.join(capitalized_parts).replace('_', ' ')
28
+ return capitalized_string