Commit b7df334 · 1 Parent(s): 11b8c2d

Significant update with multi-relation comparison across app

Files changed:
- pages/input.py: +14 -10
- pages/predict.py: +112 -89
- pages/validate.py: +148 -122
- utils.py: +31 -1
pages/input.py CHANGED

@@ -173,21 +173,21 @@ if "query" not in st.session_state:
  source_node_type_index = 0
  source_node_index = 0
  target_node_type_index = 0
- relation_index = 0
+ # relation_index = 0
  filter_diseases_value = False

  if st.session_state.team == "Clalit":
  source_node_type_index = 2
  source_node_index = 0
  target_node_type_index = 3
- relation_index = 2
+ # relation_index = 2
  filter_diseases_value = True

  else:
  source_node_type_index = st.session_state.query_options['source_node_type'].index(st.session_state.query['source_node_type'])
  source_node_index = st.session_state.query_options['source_node'].index(st.session_state.query['source_node'])
  target_node_type_index = st.session_state.query_options['target_node_type'].index(st.session_state.query['target_node_type'])
- relation_index = st.session_state.query_options['relation'].index(st.session_state.query['relation'])
+ # relation_index = st.session_state.query_options['relation'].index(st.session_state.query['relation'])
  filter_diseases_value = st.session_state.query_options['filter_diseases']

  # Define error catching function

@@ -237,11 +237,11 @@ target_node_type = st.selectbox("Target Node Type", target_node_type_options,
  format_func = lambda x: x.replace("_", " "),
  index = catch_index_error(target_node_type_index, target_node_type_options))

- # Select relation
- relation_options = edge_types[(edge_types.x_type == source_node_type) & (edge_types.y_type == target_node_type)].relation.unique()
- relation = st.selectbox("Edge Type", relation_options,
-
-
+ # # Select relation
+ # relation_options = edge_types[(edge_types.x_type == source_node_type) & (edge_types.y_type == target_node_type)].relation.unique()
+ # relation = st.selectbox("Edge Type", relation_options,
+ # format_func = lambda x: x.replace("_", "-"),
+ # index = catch_index_error(relation_index, relation_options))

  # Button to submit query
  if st.button("Submit Query"):

@@ -259,7 +259,7 @@ if st.button("Submit Query"):
  "source_node_type": source_node_type,
  "source_node": source_node,
  "target_node_type": target_node_type,
- "relation": relation
+ # "relation": relation
  }

  # Save query options to session state

@@ -267,7 +267,7 @@ if st.button("Submit Query"):
  "source_node_type": list(source_node_type_options),
  "source_node": list(source_node_options),
  "target_node_type": list(target_node_type_options),
- "relation": list(relation_options),
+ # "relation": list(relation_options),
  "filter_diseases": filter_diseases
  }

@@ -275,6 +275,10 @@ if st.button("Submit Query"):
  if "validation" in st.session_state:
  del st.session_state.validation

+ # Delete selected nodes from session state
+ if "selected_nodes" in st.session_state:
+ del st.session_state.selected_nodes
+
  # # Write query to console
  # st.write("Current Query:")
  # st.write(st.session_state.query)
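For context on the last hunk: when a new query is submitted, input.py now clears results tied to the previous query (the cached validation results and the node multiselect selection) out of st.session_state. Below is a minimal, self-contained sketch of that reset pattern; the widgets and keys are illustrative stand-ins, not the app's full query form.

```python
import streamlit as st

# Illustrative stand-in for the query form in input.py.
source_node = st.text_input("Source node", "EGFR")

if st.button("Submit Query"):
    st.session_state.query = {"source_node": source_node}

    # Drop state tied to the previous query so downstream pages
    # (predictions, validation, node search) start from a clean slate.
    for stale_key in ("validation", "selected_nodes"):
        if stale_key in st.session_state:
            del st.session_state[stale_key]

# The node-search multiselect is keyed, so deleting the key above resets
# the user's previous selection on the next rerun.
st.multiselect("Search nodes", ["TP53", "BRCA1", "EGFR"], key="selected_nodes")
```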
pages/predict.py CHANGED

@@ -18,7 +18,7 @@ plt.rcParams['font.sans-serif'] = 'Arial'

  # Custom and other imports
  import project_config
- from utils import capitalize_after_slash, load_kg
+ from utils import capitalize_after_slash, load_kg, map_dbs, map_db_names

  # Redirect to app.py if not logged in, otherwise show the navigation menu
  menu_with_redirect()

@@ -29,10 +29,9 @@ st.image(str(project_config.MEDIA_DIR / 'predict_header.svg'), use_column_width=
  # Main content
  # st.markdown(f"Hello, {st.session_state.name}!")

- st.subheader(f"{capitalize_after_slash(st.session_state.query['target_node_type'])} Search", divider = "blue")
-
  # Print current query
- st.markdown(f"**Query:** {st.session_state.query['source_node'].replace('_', ' ')} ➡️ {st.session_state.query['relation'].replace('_', '-')} ➡️ {st.session_state.query['target_node_type'].replace('_', ' ')}")
+ # st.markdown(f"**Query:** {st.session_state.query['source_node'].replace('_', ' ')} ➡️ {st.session_state.query['relation'].replace('_', '-')} ➡️ {st.session_state.query['target_node_type'].replace('_', ' ')}")
+ st.markdown(f"**Query:** {st.session_state.query['source_node'].replace('_', ' ')} ➡️ {st.session_state.query['target_node_type'].replace('_', ' ')}")

  # Print split
  split = st.session_state.split

@@ -48,7 +47,7 @@ def get_embeddings():
  # best_ckpt = "2024_05_15_13_05_33_epoch=2-step=40383"

  # Get split name
- split = st.session_state.split
+ # split = st.session_state.split
  avail_models = st.session_state.avail_models

  # Get model name from available models

@@ -79,6 +78,7 @@ def get_embeddings():

  return embed_path, relation_weights_path, edge_types_path

+
  @st.cache_data(show_spinner = 'Loading AI model...')
  def load_embeddings(embed_path, relation_weights_path, edge_types_path):

@@ -94,6 +94,7 @@ def load_embeddings(embed_path, relation_weights_path, edge_types_path):
  kg_nodes = load_kg()
  embed_path, relation_weights_path, edge_types_path = get_embeddings()
  embeddings, relation_weights, edge_types = load_embeddings(embed_path, relation_weights_path, edge_types_path)
+ edge_types_df = pd.read_csv(project_config.DATA_DIR / 'kg_edge_types.csv')

  # # Print source node type
  # st.write(f"Source Node Type: {st.session_state.query['source_node_type']}")

@@ -107,67 +108,79 @@ embeddings, relation_weights, edge_types = load_embeddings(embed_path, relation_
  # # Print target node type
  # st.write(f"Target Node Type: {st.session_state.query['target_node_type']}")

-
-
+ source_node_type = st.session_state.query['source_node_type']
+ source_node = st.session_state.query['source_node']
+ # relation = st.session_state.query['relation']
+ target_node_type = st.session_state.query['target_node_type']
+
+ # Get relation options
+ relation_options = edge_types_df[(edge_types_df.x_type == source_node_type) & (edge_types_df.y_type == target_node_type)].relation.unique()
+
+ # Add relation selector
+ relation = st.selectbox("Relation Type", relation_options, format_func = lambda x: x.replace("_", "-"))
+ display_dbs = {}
+
+ # Get source node index
+ src_index = kg_nodes[(kg_nodes.node_type == source_node_type) & (kg_nodes.node_name == source_node)].node_index.values[0]
+
+
+ @st.experimental_fragment()
+ def compute_scores():
+
+ # Compute predictions
+ with st.spinner('Computing predictions...'):
+
+ # Get target nodes indices
+ target_nodes = kg_nodes[kg_nodes.node_type == target_node_type].copy()
+ dst_indices = target_nodes.node_index.values
+ src_indices = np.repeat(src_index, len(dst_indices))
+
+ # Retrieve cached embeddings and apply activation function
+ src_embeddings = embeddings[src_indices]
+ dst_embeddings = embeddings[dst_indices]
+ src_embeddings = F.leaky_relu(src_embeddings)
+ dst_embeddings = F.leaky_relu(dst_embeddings)
+
+ for relation_i in relation_options:
+
+ # Get relation index
+ edge_type_index = [i for i, etype in enumerate(edge_types) if etype == (source_node_type, relation_i, target_node_type)][0]
+
+ # Get relation weights
+ rel_weights = relation_weights[edge_type_index]
+
+ # Compute weighted dot product
+ scores = torch.sum(src_embeddings * rel_weights * dst_embeddings, dim = 1)
+ scores = torch.sigmoid(scores).detach().numpy()
+
+ # Add scores to dataframe
+ target_nodes = kg_nodes[kg_nodes.node_type == target_node_type].copy()
+ target_nodes['score'] = scores
+ target_nodes = target_nodes.sort_values(by = 'score', ascending = False)
+ target_nodes['rank'] = np.arange(1, target_nodes.shape[0] + 1)
+
+ # Rename columns
+ display_data = target_nodes[['rank', 'node_id', 'node_name', 'score', 'node_source']].copy()
+ display_data = display_data.rename(columns = {'rank': 'Rank', 'node_id': 'ID', 'node_name': 'Name', 'score': 'Score', 'node_source': 'Database'})

-
-
- relation = st.session_state.query['relation']
- target_node_type = st.session_state.query['target_node_type']
+ # Add URLs to database column
+ display_data['Database'] = display_data.apply(lambda x: map_dbs[target_node_type](x['ID']), axis = 1)

-
-
+ # Save to display databases
+ display_dbs[relation_i] = display_data

-
-
+ # Compute scores
+ compute_scores()

-
+ # Save to session state
+ st.session_state.predictions_rel = display_dbs
+
+ @st.experimental_fragment()
+ def visualize_scores():
+
+ # Get values
  target_nodes = kg_nodes[kg_nodes.node_type == target_node_type].copy()
-
- src_indices = np.repeat(src_index, len(dst_indices))
-
- # Retrieve cached embeddings and apply activation function
- src_embeddings = embeddings[src_indices]
- dst_embeddings = embeddings[dst_indices]
- src_embeddings = F.leaky_relu(src_embeddings)
- dst_embeddings = F.leaky_relu(dst_embeddings)
-
- # Get relation weights
- rel_weights = relation_weights[edge_type_index]
-
- # Compute weighted dot product
- scores = torch.sum(src_embeddings * rel_weights * dst_embeddings, dim = 1)
- scores = torch.sigmoid(scores)
-
- # Add scores to dataframe
- target_nodes['score'] = scores.detach().numpy()
- target_nodes = target_nodes.sort_values(by = 'score', ascending = False)
- target_nodes['rank'] = np.arange(1, target_nodes.shape[0] + 1)
-
- # Rename columns
- display_data = target_nodes[['rank', 'node_id', 'node_name', 'score', 'node_source']].copy()
- display_data = display_data.rename(columns = {'rank': 'Rank', 'node_id': 'ID', 'node_name': 'Name', 'score': 'Score', 'node_source': 'Database'})
-
- # Define dictionary mapping node types to database URLs
- map_dbs = {
- 'gene/protein': lambda x: f"https://ncbi.nlm.nih.gov/gene/?term={x}",
- 'drug': lambda x: f"https://go.drugbank.com/drugs/{x}",
- 'effect/phenotype': lambda x: f"https://hpo.jax.org/app/browse/term/HP:{x.zfill(7)}", # pad with 0s to 7 digits
- 'disease': lambda x: x, # MONDO
- # pad with 0s to 7 digits
- 'biological_process': lambda x: f"https://amigo.geneontology.org/amigo/term/GO:{x.zfill(7)}",
- 'molecular_function': lambda x: f"https://amigo.geneontology.org/amigo/term/GO:{x.zfill(7)}",
- 'cellular_component': lambda x: f"https://amigo.geneontology.org/amigo/term/GO:{x.zfill(7)}",
- 'exposure': lambda x: f"https://ctdbase.org/detail.go?type=chem&acc={x}",
- 'pathway': lambda x: f"https://reactome.org/content/detail/{x}",
- 'anatomy': lambda x: x,
- }
-
- # Get name of database
- display_database = display_data['Database'].values[0]
-
- # Add URLs to database column
- display_data['Database'] = display_data.apply(lambda x: map_dbs[target_node_type](x['ID']), axis = 1)
+ display_data = display_dbs[relation]

  # Check if validation data exists
  if 'validation' in st.session_state:

@@ -203,9 +216,12 @@ with st.spinner('Computing predictions...'):

  # NODE SEARCH

+ st.subheader(f"{capitalize_after_slash(st.session_state.query['target_node_type'])} Search", divider = "blue")
+
  # Use multiselect to search for specific nodes
- selected_nodes = st.multiselect(f"Search for specific {target_node_type.replace('_', ' ')} nodes to determine their
- display_data.Name, placeholder = "Type to search..."
+ selected_nodes = st.multiselect(f"Search for specific {target_node_type.replace('_', ' ')} nodes to determine their rankings.",
+ display_data.Name, placeholder = "Type to search...", key = 'selected_nodes',
+ default = st.session_state.selected_nodes if 'selected_nodes' in st.session_state else None)

  # Filter nodes
  if len(selected_nodes) > 0:

@@ -213,7 +229,7 @@ with st.spinner('Computing predictions...'):
  if show_val:
  # selected_display_data = val_display_data[val_display_data.Name.isin(selected_nodes)]
  selected_display_data = val_display_data[val_display_data.Name.isin(selected_nodes)].copy()
- selected_display_data = selected_display_data.reset_index(drop=True)
+ selected_display_data = selected_display_data.reset_index(drop=True)
  else:
  selected_display_data = display_data[display_data.Name.isin(selected_nodes)].copy()
  selected_display_data = selected_display_data.reset_index(drop=True)

@@ -222,12 +238,15 @@ with st.spinner('Computing predictions...'):
  selected_display_data_with_rank = selected_display_data.copy()
  selected_display_data_with_rank['Rank'] = selected_display_data_with_rank['Rank'].apply(lambda x: f"{x} (top {(100*x/target_nodes.shape[0]):.2f}% of predictions)")

+ if show_val:
+ selected_display_data_with_rank = selected_display_data_with_rank.style.map(style_val, subset=val_relations)
+
  # Show filtered nodes
  if target_node_type not in ['disease', 'anatomy']:
  st.dataframe(selected_display_data_with_rank, use_container_width = True, hide_index = True,
  column_config={"Database": st.column_config.LinkColumn(width = "small",
-
-
+ help = "Click to visit external database.",
+ display_text = map_db_names[target_node_type])})
  else:
  st.dataframe(selected_display_data_with_rank, use_container_width = True)

@@ -260,30 +279,26 @@ with st.spinner('Computing predictions...'):
  ax.grid(alpha = 0.2, zorder=0)

  st.pyplot(fig)
-
-
+
+
  # FULL RESULTS

  # Show top ranked nodes
- st.subheader("
+ st.subheader(f"{relation.replace('_', ' ').title()} Predictions", divider = "blue")
  top_k = st.slider('Select number of top ranked nodes to show.', 1, target_nodes.shape[0], min(500, target_nodes.shape[0]))

  # Show full results
  # full_results = val_display_data.iloc[:top_k] if show_val else display_data.iloc[:top_k]
  full_results = val_display_data.iloc[:top_k].style.map(style_val, subset=val_relations) if show_val else display_data.iloc[:top_k]
-
+
  if target_node_type not in ['disease', 'anatomy']:
  st.dataframe(full_results, use_container_width = True, hide_index = True,
  column_config={"Database": st.column_config.LinkColumn(width = "small",
-
-
+ help = "Click to visit external database.",
+ display_text = map_db_names[target_node_type])})
  else:
  st.dataframe(full_results, use_container_width = True, hide_index = True,)

- # Save to session state
- st.session_state.predictions = display_data
- st.session_state.display_database = display_database
-
  # If validation not in session state
  if 'validation' not in st.session_state:

@@ -293,10 +308,15 @@ with st.spinner('Computing predictions...'):
  if st.button("Validate Predictions"):
  st.switch_page("pages/validate.py")

+ visualize_scores()
+
+
+ ####################################################################################################

-
+ # relation_options = st.session_state.query_options['relation']

-
+ @st.experimental_fragment()
+ def compare_scores():

  if len(relation_options) > 1:

@@ -316,11 +336,12 @@ with st.spinner('Computing predictions...'):

  with relation_1_col:
  relation_1 = st.selectbox("Select first relation:", relation_options,
-
+ format_func = lambda x: x.replace("_", "-"), index = relation_1_index)

  with relation_2_col:
-
-
+ relation_2_options = [rel for rel in relation_options if rel != relation_1]
+ relation_2 = st.selectbox("Select second relation:", relation_2_options,
+ format_func = lambda x: x.replace("_", "-"), index = relation_2_index)

  # Get relation index
  rel_1_index = [i for i, etype in enumerate(edge_types) if etype == (source_node_type, relation_1, target_node_type)][0]

@@ -355,18 +376,18 @@ with st.spinner('Computing predictions...'):
  target_nodes = target_nodes.sort_values(by = 'rel_1_score', ascending = False)
  target_nodes['rel_1_rank'] = np.arange(1, target_nodes.shape[0] + 1)

- # Rename relations
- relation_1 = relation_1.replace("_", " ").title()
- relation_2 = relation_2.replace("_", " ").title()
-
  # Compute correlation coefficient of scores
  corr = target_nodes['rel_1_score'].corr(target_nodes['rel_2_score'])
  spearman_corr = target_nodes['rel_1_score'].corr(target_nodes['rel_2_score'], method = 'spearman')

- st.markdown(f"The correlation coefficient between {relation_1} and {relation_2} scores is:")
+ st.markdown(f"The correlation coefficient between {relation_1.replace('_', ' ')} and {relation_2.replace('_', ' ')} scores is:")
  st.markdown(f"**Pearson's $r$:** {corr:.2f} (Score)")
  st.markdown(f"**Spearman's $\\rho$:** {spearman_corr:.2f} (Rank)")

+ # Rename relations
+ relation_1 = relation_1.replace("_", " ").title()
+ relation_2 = relation_2.replace("_", " ").title()
+
  # Rename columns
  display_comp = target_nodes[['node_id', 'node_name', 'rel_1_rank', 'rel_2_rank', 'rel_1_score', 'rel_2_score', 'node_source']].copy()
  display_comp = display_comp.rename(columns = {

@@ -398,7 +419,7 @@ with st.spinner('Computing predictions...'):
  rel_2_min = target_nodes[rel_2_column].min()
  rel_1_max = target_nodes[rel_1_column].max()
  rel_2_max = target_nodes[rel_2_column].max()
- ax.plot([0, rel_1_max], [0, rel_2_max], color = 'red',
+ ax.plot([0, rel_1_max], [0, rel_2_max], color = 'red', linewidth = 1.5,
  linestyle = '--', zorder = 3) # label = 'Equal Rank',
  ax.set_xlim(rel_1_min, rel_1_max)
  ax.set_ylim(rel_2_min, rel_2_max)

@@ -448,7 +469,7 @@ with st.spinner('Computing predictions...'):
  st.dataframe(display_comp_styled, use_container_width = True, hide_index = True,
  column_config={"Database": st.column_config.LinkColumn(width = "small",
  help = "Click to visit external database.",
- display_text =
+ display_text = map_db_names[target_node_type])})

  else:

@@ -456,4 +477,6 @@ with st.spinner('Computing predictions...'):
  st.dataframe(display_comp, use_container_width = True, hide_index = True,
  column_config={"Database": st.column_config.LinkColumn(width = "small",
  help = "Click to visit external database.",
- display_text =
+ display_text = map_db_names[target_node_type])})
+
+ compare_scores()
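The core of the predict.py change is the per-relation scoring loop: link-prediction scores are a relation-weighted dot product between the source and target node embeddings, passed through a sigmoid, computed once per relation that exists between the chosen node types. Below is a minimal sketch of that computation using random tensors in place of the app's cached knowledge-graph embeddings; the shapes, relation names, and indices are illustrative assumptions, not the app's actual data.

```python
import torch
import torch.nn.functional as F

num_nodes, dim = 1000, 128
embeddings = torch.randn(num_nodes, dim)      # stand-in node embeddings
relation_weights = torch.randn(3, dim)        # one weight vector per relation type
relation_names = ["indication", "contraindication", "off_label_use"]  # hypothetical

src_index = 42                                # index of the query (source) node
dst_indices = torch.arange(num_nodes)         # score against every candidate target

# Apply the activation to cached embeddings, as the diff does.
src = F.leaky_relu(embeddings[src_index]).unsqueeze(0)   # (1, dim), broadcasts over targets
dst = F.leaky_relu(embeddings[dst_indices])              # (num_nodes, dim)

scores_per_relation = {}
for rel_idx, rel_name in enumerate(relation_names):
    # Weighted dot product followed by a sigmoid, one score per candidate target.
    raw = torch.sum(src * relation_weights[rel_idx] * dst, dim=1)
    scores_per_relation[rel_name] = torch.sigmoid(raw)

# Rank candidates for one relation, highest score first (the "Rank" column).
ranking = torch.argsort(scores_per_relation["indication"], descending=True)
print(ranking[:10])
```

Caching the scores per relation in a dictionary (as the diff does with `display_dbs` and `st.session_state.predictions_rel`) is what lets the new relation selector and the relation-comparison fragment rerun without recomputing embeddings.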
pages/validate.py CHANGED

@@ -14,11 +14,11 @@ plt.rcParams['font.sans-serif'] = 'Arial'
  import matplotlib.colors as mcolors

  # Import metrics
- from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, f1_score
+ from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, f1_score, balanced_accuracy_score

  # Custom and other imports
  import project_config
- from utils import load_kg, load_kg_edges
+ from utils import load_kg, load_kg_edges, map_db_names

  # Redirect to app.py if not logged in, otherwise show the navigation menu
  menu_with_redirect()

@@ -32,7 +32,8 @@ st.image(str(project_config.MEDIA_DIR / 'validate_header.svg'), use_column_width
  st.subheader("Validate Predictions", divider = "green")

  # Print current query
- st.markdown(f"**Query:** {st.session_state.query['source_node'].replace('_', ' ')} ➡️ {st.session_state.query['relation'].replace('_', '-')} ➡️ {st.session_state.query['target_node_type'].replace('_', ' ')}")
+ # st.markdown(f"**Query:** {st.session_state.query['source_node'].replace('_', ' ')} ➡️ {st.session_state.query['relation'].replace('_', '-')} ➡️ {st.session_state.query['target_node_type'].replace('_', ' ')}")
+ st.markdown(f"**Query:** {st.session_state.query['source_node'].replace('_', ' ')} ➡️ {st.session_state.query['target_node_type'].replace('_', ' ')}")

  # Print split
  split = st.session_state.split

@@ -44,9 +45,14 @@ st.markdown(f"**Disease Split:** {st.session_state.split} ({num_nodes} nodes, {n
  # Get query and predictions
  source_node_type = st.session_state.query['source_node_type']
  source_node = st.session_state.query['source_node']
- relation = st.session_state.query['relation']
+ # relation = st.session_state.query['relation']
  target_node_type = st.session_state.query['target_node_type']
-
+ predictions_rel = st.session_state.predictions_rel
+
+ # Get relation options
+ edge_types_df = pd.read_csv(project_config.DATA_DIR / 'kg_edge_types.csv')
+ relation_options = edge_types_df[(edge_types_df.x_type == source_node_type) & (edge_types_df.y_type == target_node_type)].relation.unique()
+ max_rank = predictions_rel[relation_options[0]]['Rank'].max()

  @st.experimental_fragment()
  def plot_options():

@@ -58,8 +64,7 @@ def plot_options():

  # Slider for x-axis limit
  axis_limits = st.slider('Define the range of ranks to visualize.',
- min_value=0, max_value=
- value=(0, predictions['Rank'].max()), step=1000)
+ min_value=0, max_value=max_rank, value=(0, max_rank), step=1000)

  # Update session state
  st.session_state.show_lines = show_lines

@@ -72,12 +77,12 @@ plot_options()
  if 'show_lines' not in st.session_state:
  st.session_state.show_lines = False
  if 'axis_limits' not in st.session_state:
- st.session_state.axis_limits = (0,
+ st.session_state.axis_limits = (0, max_rank)

  # Button to update plot
- col1, col2, col3 = st.columns([
+ col1, col2, col3 = st.columns([2, 2, 2])
  with col2:
- update_button = st.button('Generate Plot')
+ update_button = st.button('Generate Plot and Metrics')

  # Horizontal line
  st.markdown('---')

@@ -90,24 +95,14 @@ if update_button:
  # Convert tuple to hex
  def rgba_to_hex(rgba):
  return mcolors.to_hex(rgba[:3])
-
+
+ # Subset existing edges
  with st.spinner('Searching known relationships...'):
-
- # Subset existing edges
  edge_subset = kg_edges[(kg_edges.x_type == source_node_type) & (kg_edges.x_name == source_node)]
  edge_subset = edge_subset[edge_subset.y_type == target_node_type]

- # Merge edge subset with predictions
- edges_in_kg = pd.merge(predictions, edge_subset[['relation', 'y_id']], left_on = 'ID', right_on = 'y_id', how = 'right')
- edges_in_kg = edges_in_kg.sort_values(by = 'Score', ascending = False)
- edges_in_kg = edges_in_kg.drop(columns = 'y_id')
-
- # Rename relation to ground-truth
- edges_in_kg = edges_in_kg[['relation'] + [col for col in edges_in_kg.columns if col != 'relation']]
- edges_in_kg = edges_in_kg.rename(columns = {'relation': 'Known Relation'})
-
  # If there exist edges in KG
- if len(
+ if len(edge_subset) > 0:

  with st.spinner('Saving validation results...'):

@@ -119,118 +114,149 @@ if update_button:
  # Save validation results to session state
  st.session_state.validation = val_results

-
+ # Define a color map for different relations
+ color_map = plt.get_cmap('tab10')

-
- color_map = plt.get_cmap('tab10')
+ for idx, relation in enumerate(relation_options):

- #
-
- for idx, relation in enumerate(relations):
+ # Get predictions for specific relation
+ predictions = predictions_rel[relation]

-
+ # Merge edge subset with predictions
+ edge_subset_rel = edge_subset[['relation', 'y_id']].copy()
+ edges_in_kg = pd.merge(predictions, edge_subset_rel, left_on = 'ID', right_on = 'y_id', how = 'right')
+ edges_in_kg = edges_in_kg.sort_values(by = 'Score', ascending = False)
+ edges_in_kg = edges_in_kg.drop(columns = 'y_id')
+
+ # Rename relation to ground-truth
+ edges_in_kg = edges_in_kg[['relation'] + [col for col in edges_in_kg.columns if col != 'relation']]
+ edges_in_kg = edges_in_kg.rename(columns = {'relation': 'Known Relation'})
+
+ with st.spinner('Plotting known relationships...'):

  # Get a color from the color map
  color = color_map(idx % color_map.N)

- fig, ax = plt.subplots(figsize=(10, 5))
- ax.plot(predictions['Rank'], predictions['Score'], color = 'black', linewidth = 1.5, zorder = 2)
- ax.set_xlabel('Rank', fontsize=12)
- ax.set_ylabel('Score', fontsize=12)
- # ax.set_xlim(1, predictions['Rank'].max())
- # ax.set_xlim(axis_limits)
- ax.set_xlim(st.session_state.axis_limits)
-
- for i, node in relation_data.iterrows():
- if st.session_state.show_lines:
- ax.axvline(node['Rank'], color=color, linestyle='--', label=node['Name'], zorder = 3)
- ax.scatter(node['Rank'], node['Score'], color=color, zorder=3) # s=15
- # ax.text(node['Rank'] + 100, node['Score'], node['Name'], fontsize=10, color=color)
-
- # Also calculate and plot recall at K
- ax2 = ax.twinx()
-
- # Calculate recall at K for all Rank
- recall_at_k = []
- for k in range(1, predictions['Rank'].max() + 1):
- recall = 100*len(relation_data[relation_data['Rank'] <= k]) / len(relation_data)
- recall_at_k.append(recall)
-
- ax2.plot(range(1, predictions['Rank'].max() + 1), recall_at_k,
- color = 'red', linestyle = '--', label = 'Recall at K', zorder = 4, linewidth = 2)
-
- # Set labels
- ax2.set_ylabel('Recall at K (%)', fontsize=12, color='red')
-
- # Add grid
- ax.grid(True, linestyle=':', alpha=0.5, zorder=0)
-
- # ax.set_title(f'{relation.replace("_", "-")}')
- # ax.legend()
  color_hex = rgba_to_hex(color)

  # Write header in color of relation
  st.markdown(f"<h3 style='color:{color_hex}'>{relation.replace('_', ' ').title()}</h3>", unsafe_allow_html=True)

+
+ # Group by relation and create separate plots
+ # relations = edges_in_kg['Known Relation'].unique()
+ # for idx, relation in enumerate(relations):
+
+ relation_data = edges_in_kg[edges_in_kg['Known Relation'] == relation]
+
+ if len(relation_data) > 0:
+
+ fig, ax = plt.subplots(figsize=(10, 5))
+ ax.plot(predictions['Rank'], predictions['Score'], color = 'black', linewidth = 1.5, zorder = 2)
+ ax.set_xlabel('Rank', fontsize=12)
+ ax.set_ylabel('Score', fontsize=12)
+ # ax.set_xlim(1, predictions['Rank'].max())
+ # ax.set_xlim(axis_limits)
+ ax.set_xlim(st.session_state.axis_limits)
+
+ for i, node in relation_data.iterrows():
+ if st.session_state.show_lines:
+ ax.axvline(node['Rank'], color=color, linestyle='--', label=node['Name'], zorder = 3)
+ ax.scatter(node['Rank'], node['Score'], color=color, zorder=3) # s=15
+ # ax.text(node['Rank'] + 100, node['Score'], node['Name'], fontsize=10, color=color)
+
+ # Also calculate and plot recall at K
+ ax2 = ax.twinx()
+
+ # Calculate recall at K for all Rank
+ recall_at_k = []
+ for k in range(1, predictions['Rank'].max() + 1):
+ recall = 100*len(relation_data[relation_data['Rank'] <= k]) / len(relation_data)
+ recall_at_k.append(recall)
+
+ ax2.plot(range(1, predictions['Rank'].max() + 1), recall_at_k,
+ color = 'red', linestyle = '--', label = 'Recall at K', zorder = 4, linewidth = 2)
+
+ # Set labels
+ ax2.set_ylabel('Recall at K (%)', fontsize=12, color='red')
+
+ # Add grid
+ ax.grid(True, linestyle=':', alpha=0.5, zorder=0)
+
+ # ax.set_title(f'{relation.replace("_", "-")}')
+ # ax.legend()
+
+ # Show plot
+ st.pyplot(fig)
+
+ # Create recall at K table
+ k_vals = [10, 50, 100, 500, 1000, 5000, 10000]
+ recall_at_k = []
+ for k in k_vals:
+ recall = 100*len(relation_data[relation_data['Rank'] <= k]) / len(relation_data)
+ recall = f"{recall:.2f}%"
+ recall_at_k.append(recall)
+ recall_df = pd.DataFrame({'K': k_vals, 'Recall': recall_at_k})
+
+ # Transpose and display recall at K
+ recall_df = recall_df.T
+ recall_df.columns = [f"k = {k:.0f}" for k in recall_df.iloc[0]]
+ recall_df = recall_df.drop('K')
+ st.markdown('**Recall at $k$:**')
+ st.dataframe(recall_df, use_container_width=True)
+
+ # Compute other statistics
+ st.markdown('**Statistics:**')
+
+ # Binarize score
+ pred_threshold = 0.5
+ raw_score = predictions['Score']
+ binary_score = (raw_score > pred_threshold).astype(int)
+ true_label = np.zeros(len(predictions))
+
+ # Set true label to 1 for known relations
+
+ # Reset index
+ predictions_idx = predictions.copy().reset_index(drop = True)
+ true_label[predictions_idx[predictions_idx['ID'].isin(relation_data['ID'])].index] = 1
+
+ # Compute scores
+ accuracy = accuracy_score(true_label, binary_score)
+ balanced_accuracy = balanced_accuracy_score(true_label, binary_score)
+ accuracy = f"{100*accuracy:.2f}%"
+ balanced_accuracy = f"{100*balanced_accuracy:.2f}%"
+ ap = average_precision_score(true_label, raw_score)
+ f1 = f1_score(true_label, binary_score, average = 'micro')
+ try:
+ auc = roc_auc_score(true_label, raw_score)
+ except ValueError:
+ auc = 0.5
+
+ # Create dataframe
+ stats_df = pd.DataFrame({
+ 'Acc.': [accuracy], 'Balanced Acc.': [balanced_accuracy],
+ 'AUC': [auc], 'AP': [ap], 'F1': [f1]
+ })
+ stats_df.index = ["Value"]
+ st.dataframe(stats_df, use_container_width=True)
+
+ # Drop known relation column
+ st.markdown('**Known Relationships:**')
+ relation_data = relation_data.drop(columns = 'Known Relation')
+ relation_data['Rank'] = relation_data['Rank'].apply(lambda x: f"{x} (top {(100*x/predictions.shape[0]):.2f}%)")
+
+ if target_node_type not in ['disease', 'anatomy']:
+ st.dataframe(relation_data, use_container_width=True, hide_index = True,
+ column_config={"Database": st.column_config.LinkColumn(width = "small",
+ help = "Click to visit external database.",
+ display_text = map_db_names[target_node_type],)})
+ else:
+ st.dataframe(relation_data, use_container_width=True, hide_index = True)
+
  else:
-
+
+ st.error(f"No ground truth {relation.replace('_', ' ')} edges found for {source_node} in the knowledge graph.", icon="✖️")

  else:

- st.error(
+ st.error(f"No ground truth {target_node_type} relationships found for {source_node} in the knowledge graph.", icon="✖️")
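The validation metrics added here are recall at K over the known edges plus accuracy, balanced accuracy, AUC, AP, and F1 against scores binarized at 0.5. A minimal sketch of those computations follows, using synthetic predictions and a synthetic set of known edges in place of the app's dataframes; names and sizes are illustrative only.

```python
import numpy as np
import pandas as pd
from sklearn.metrics import (roc_auc_score, average_precision_score,
                             accuracy_score, balanced_accuracy_score, f1_score)

rng = np.random.default_rng(0)
predictions = pd.DataFrame({"ID": np.arange(5000), "Score": rng.random(5000)})
predictions = predictions.sort_values("Score", ascending=False)
predictions["Rank"] = np.arange(1, len(predictions) + 1)

# Pretend 20 of the candidates are known ("ground truth") edges in the KG.
known_ids = rng.choice(predictions["ID"].to_numpy(), size=20, replace=False)
relation_data = predictions[predictions["ID"].isin(known_ids)]

# Recall at K: fraction of known edges ranked within the top K predictions.
for k in (10, 100, 1000):
    recall = 100 * (relation_data["Rank"] <= k).sum() / len(relation_data)
    print(f"Recall@{k}: {recall:.2f}%")

# Binary metrics at a 0.5 score threshold, mirroring the diff.
true_label = predictions["ID"].isin(known_ids).astype(int).to_numpy()
binary_score = (predictions["Score"] > 0.5).astype(int)
print("Accuracy:", accuracy_score(true_label, binary_score))
print("Balanced accuracy:", balanced_accuracy_score(true_label, binary_score))
print("AP:", average_precision_score(true_label, predictions["Score"]))
print("F1 (micro):", f1_score(true_label, binary_score, average="micro"))
try:
    print("AUC:", roc_auc_score(true_label, predictions["Score"]))
except ValueError:
    # Only one class present; the diff falls back to 0.5 in this case.
    print("AUC: 0.5")
```

Worth noting: with average = 'micro' on a single-label task, F1 reduces to plain accuracy, so on these heavily imbalanced validation sets the balanced accuracy, AP, and AUC columns carry most of the signal.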
utils.py CHANGED

@@ -25,4 +25,34 @@ def capitalize_after_slash(s):
  capitalized_parts = [part.title() for part in parts]
  # Rejoin the parts with slashes
  capitalized_string = '/'.join(capitalized_parts).replace('_', ' ')
- return capitalized_string
+ return capitalized_string
+
+
+ # Define dictionary mapping node types to database URLs
+ map_dbs = {
+ 'gene/protein': lambda x: f"https://ncbi.nlm.nih.gov/gene/?term={x}",
+ 'drug': lambda x: f"https://go.drugbank.com/drugs/{x}",
+ 'effect/phenotype': lambda x: f"https://hpo.jax.org/app/browse/term/HP:{x.zfill(7)}", # pad with 0s to 7 digits
+ 'disease': lambda x: x, # MONDO
+ # pad with 0s to 7 digits
+ 'biological_process': lambda x: f"https://amigo.geneontology.org/amigo/term/GO:{x.zfill(7)}",
+ 'molecular_function': lambda x: f"https://amigo.geneontology.org/amigo/term/GO:{x.zfill(7)}",
+ 'cellular_component': lambda x: f"https://amigo.geneontology.org/amigo/term/GO:{x.zfill(7)}",
+ 'exposure': lambda x: f"https://ctdbase.org/detail.go?type=chem&acc={x}",
+ 'pathway': lambda x: f"https://reactome.org/content/detail/{x}",
+ 'anatomy': lambda x: x,
+ }
+
+ # Define dictionary mapping node types to database names
+ map_db_names = {
+ 'gene/protein': 'NCBI',
+ 'drug': 'DrugBank',
+ 'effect/phenotype': 'HPO',
+ 'disease': 'MONDO',
+ 'biological_process': 'GO: BP',
+ 'molecular_function': 'GO: MF',
+ 'cellular_component': 'GO: CC',
+ 'exposure': 'CTD',
+ 'pathway': 'Reactome',
+ 'anatomy': 'UBERON',
+ }
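The two lookup tables moved into utils.py are used together across the pages: map_dbs turns a node ID into an external database URL, and map_db_names supplies the link label shown in the Streamlit LinkColumn. A quick usage sketch is below; it assumes the repository's utils.py is importable, and the example ID is illustrative rather than taken from the knowledge graph.

```python
# Sketch of using the shared lookup tables from utils.py.
from utils import map_dbs, map_db_names

node_type = "effect/phenotype"
node_id = "1250"  # HPO IDs are zero-padded to 7 digits by the lambda

url = map_dbs[node_type](node_id)
label = map_db_names[node_type]
print(label, url)  # HPO https://hpo.jax.org/app/browse/term/HP:0001250
```

Centralizing these mappings removes the per-page copy of the URL dictionary that the old predict.py carried, so all three pages render the same external links.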