Spaces:

ayushnoori
/

clinical-drug-repurposing

Sleeping

App Files Files Community

ayushnoori committed on Jun 29, 2024

Commit

11b8c2d

1 Parent(s): 9a01b6b

Add statistics and recall to validation, but for ORIGINAL EDGE TYPE

Browse files

Files changed (1) hide show

pages/validate.py +184 -77

pages/validate.py CHANGED Viewed

@@ -13,6 +13,9 @@ import matplotlib.pyplot as plt
 plt.rcParams['font.sans-serif'] = 'Arial'
 import matplotlib.colors as mcolors
 # Custom and other imports
 import project_config
 from utils import load_kg, load_kg_edges
@@ -45,85 +48,189 @@ relation = st.session_state.query['relation']
 target_node_type = st.session_state.query['target_node_type']
 predictions = st.session_state.predictions
-kg_nodes = load_kg()
-kg_edges = load_kg_edges()
-# Convert tuple to hex
-def rgba_to_hex(rgba):
-    return mcolors.to_hex(rgba[:3])
-with st.spinner('Searching known relationships...'):
-    # Subset existing edges
-    edge_subset = kg_edges[(kg_edges.x_type == source_node_type) & (kg_edges.x_name == source_node)]
-    edge_subset = edge_subset[edge_subset.y_type == target_node_type]
-    # Merge edge subset with predictions
-    edges_in_kg = pd.merge(predictions, edge_subset[['relation', 'y_id']], left_on = 'ID', right_on = 'y_id', how = 'right')
-    edges_in_kg = edges_in_kg.sort_values(by = 'Score', ascending = False)
-    edges_in_kg = edges_in_kg.drop(columns = 'y_id')
-    # Rename relation to ground-truth
-    edges_in_kg = edges_in_kg[['relation'] + [col for col in edges_in_kg.columns if col != 'relation']]
-    edges_in_kg = edges_in_kg.rename(columns = {'relation': 'Known Relation'})
-# If there exist edges in KG
-if len(edges_in_kg) > 0:
-    with st.spinner('Saving validation results...'):
-        # Cast long to wide
-        val_results = edge_subset[['relation', 'y_id']].pivot_table(index='y_id', columns='relation', aggfunc='size', fill_value=0)
-        val_results = (val_results > 0).astype(int).reset_index()
-        val_results.columns = [val_results.columns[0]] + [x.replace('_', ' ').title() for x in val_results.columns[1:]]
-        # Save validation results to session state
-        st.session_state.validation = val_results
-    with st.spinner('Plotting known relationships...'):
-        # Define a color map for different relations
-        color_map = plt.get_cmap('tab10')
-        # Group by relation and create separate plots
-        relations = edges_in_kg['Known Relation'].unique()
-        for idx, relation in enumerate(relations):
-            relation_data = edges_in_kg[edges_in_kg['Known Relation'] == relation]
-            # Get a color from the color map
-            color = color_map(idx % color_map.N)
-            fig, ax = plt.subplots(figsize=(10, 3))
-            ax.plot(predictions['Rank'], predictions['Score'])
-            ax.set_xlabel('Rank', fontsize=12)
-            ax.set_ylabel('Score', fontsize=12)
-            ax.set_xlim(1, predictions['Rank'].max())
-            for i, node in relation_data.iterrows():
-                ax.axvline(node['Rank'], color=color, linestyle='--', label=node['Name'])
-                # ax.text(node['Rank'] + 100, node['Score'], node['Name'], fontsize=10, color=color)
-            # ax.set_title(f'{relation.replace("_", "-")}')
-            # ax.legend()
-            color_hex = rgba_to_hex(color)
-            # Write header in color of relation
-            st.markdown(f"<h3 style='color:{color_hex}'>{relation.replace('_', ' ').title()}</h2>", unsafe_allow_html=True)
-            # Show plot
-            st.pyplot(fig)
-            # Drop known relation column
-            relation_data = relation_data.drop(columns = 'Known Relation')
-            if target_node_type not in ['disease', 'anatomy']:
-                st.dataframe(relation_data, use_container_width=True,
-                            column_config={"Database": st.column_config.LinkColumn(width = "small",
-                                                                                    help = "Click to visit external database.",
-                                                                                    display_text = st.session_state.display_database)})
-            else:
-                st.dataframe(relation_data, use_container_width=True)
-else:
-    st.error('No ground truth relationships found for the given query in the knowledge graph.', icon="✖️")

 plt.rcParams['font.sans-serif'] = 'Arial'
 import matplotlib.colors as mcolors
+# Import metrics
+from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, f1_score
 # Custom and other imports
 import project_config
 from utils import load_kg, load_kg_edges
 target_node_type = st.session_state.query['target_node_type']
 predictions = st.session_state.predictions
+@st.experimental_fragment()
+def plot_options():
+    """Render the plotting controls and store the selections in session state.
+
+    Draws a checkbox and a two-ended rank slider, then writes the chosen values
+    to ``st.session_state.show_lines`` and ``st.session_state.axis_limits``.
+    """
+    st.markdown("<h5 style='margin-top: 10px;'>Plotting Options</h5>", unsafe_allow_html=True)
+    # Largest rank among the predictions; upper bound and default right end of the slider
+    max_rank = predictions['Rank'].max()
+    # Checkbox toggling the rug plot of existing edges (off by default)
+    rug_enabled = st.checkbox('Show rug plot of existing edges?', value=False)
+    # Range slider over ranks, defaulting to the full span
+    rank_range = st.slider(
+        'Define the range of ranks to visualize.',
+        min_value=0,
+        max_value=max_rank,
+        value=(0, max_rank),
+        step=1000,
+    )
+    # Publish both selections so code outside this function can read them
+    st.session_state.show_lines, st.session_state.axis_limits = rug_enabled, rank_range
+# Render the option widgets; plot_options() also writes show_lines / axis_limits to session state
+plot_options()
+# Fallback defaults for the two option keys.
+# NOTE(review): plot_options() above assigns both keys on every call, so these guards look
+# unreachable as written - confirm whether they were meant to run before that call.
+if 'show_lines' not in st.session_state:
+    st.session_state.show_lines = False
+if 'axis_limits' not in st.session_state:
+    st.session_state.axis_limits = (0, predictions['Rank'].max())
+# Place the button in the middle of three columns (width ratio 4:2:4)
+col1, col2, col3 = st.columns([4, 2, 4])
+with col2:
+    # update_button gates the whole validation/plotting section that follows
+    update_button = st.button('Generate Plot')
+# Horizontal line between the controls and the results
+st.markdown('---')
+if update_button:
+    # Validation section: look up the known KG edges for the queried source node, then for
+    # each known relation type show a score-vs-rank plot with a recall-at-K curve, a
+    # recall-at-K table, summary statistics, and the table of known relationships.
+    kg_nodes = load_kg()
+    kg_edges = load_kg_edges()
+    # Convert an RGBA tuple to a hex color string (only the RGB channels are used)
+    def rgba_to_hex(rgba):
+        return mcolors.to_hex(rgba[:3])
+    with st.spinner('Searching known relationships...'):
+        # Subset existing edges: queried source node -> nodes of the target type
+        edge_subset = kg_edges[(kg_edges.x_type == source_node_type) & (kg_edges.x_name == source_node)]
+        edge_subset = edge_subset[edge_subset.y_type == target_node_type]
+        # Right-merge so every known edge is kept, even when it has no matching prediction row
+        edges_in_kg = pd.merge(predictions, edge_subset[['relation', 'y_id']], left_on = 'ID', right_on = 'y_id', how = 'right')
+        edges_in_kg = edges_in_kg.sort_values(by = 'Score', ascending = False)
+        edges_in_kg = edges_in_kg.drop(columns = 'y_id')
+        # Move the relation column to the front and rename it to mark it as ground truth
+        edges_in_kg = edges_in_kg[['relation'] + [col for col in edges_in_kg.columns if col != 'relation']]
+        edges_in_kg = edges_in_kg.rename(columns = {'relation': 'Known Relation'})
+    # If there exist edges in KG
+    if len(edges_in_kg) > 0:
+        with st.spinner('Saving validation results...'):
+            # Cast long to wide: one row per target node, one 0/1 indicator column per relation
+            val_results = edge_subset[['relation', 'y_id']].pivot_table(index='y_id', columns='relation', aggfunc='size', fill_value=0)
+            val_results = (val_results > 0).astype(int).reset_index()
+            val_results.columns = [val_results.columns[0]] + [x.replace('_', ' ').title() for x in val_results.columns[1:]]
+            # Save validation results to session state
+            st.session_state.validation = val_results
+        with st.spinner('Plotting known relationships...'):
+            # Define a color map for different relations
+            color_map = plt.get_cmap('tab10')
+            # Inputs that do not depend on the relation are computed once, outside the loop
+            pred_threshold = 0.5
+            raw_score = predictions['Score']
+            binary_score = (raw_score > pred_threshold).astype(int)
+            all_k = np.arange(1, predictions['Rank'].max() + 1)
+            k_vals = [10, 50, 100, 500, 1000, 5000, 10000]
+            # Group by relation and create separate plots
+            relations = edges_in_kg['Known Relation'].unique()
+            for idx, relation in enumerate(relations):
+                relation_data = edges_in_kg[edges_in_kg['Known Relation'] == relation]
+                n_known = len(relation_data)
+                # Get a color from the color map
+                color = color_map(idx % color_map.N)
+                fig, ax = plt.subplots(figsize=(10, 5))
+                ax.plot(predictions['Rank'], predictions['Score'], color = 'black', linewidth = 1.5, zorder = 2)
+                ax.set_xlabel('Rank', fontsize=12)
+                ax.set_ylabel('Score', fontsize=12)
+                ax.set_xlim(st.session_state.axis_limits)
+                for _, node in relation_data.iterrows():
+                    if st.session_state.show_lines:
+                        ax.axvline(node['Rank'], color=color, linestyle='--', label=node['Name'], zorder = 3)
+                    ax.scatter(node['Rank'], node['Score'], color=color, zorder=3)
+                # Also calculate and plot recall at K
+                ax2 = ax.twinx()
+                # Recall at every K in one pass: sort the known ranks once, then count how many
+                # are <= K with a binary search. Known edges without a rank (no prediction row)
+                # are never hits but stay in the denominator, matching a per-K "Rank <= K" filter.
+                sorted_ranks = np.sort(relation_data['Rank'].dropna().to_numpy(dtype=float))
+                recall_curve = 100 * np.searchsorted(sorted_ranks, all_k, side='right') / n_known
+                ax2.plot(all_k, recall_curve,
+                        color = 'red', linestyle = '--', label = 'Recall at K', zorder = 4, linewidth = 2)
+                # Set labels
+                ax2.set_ylabel('Recall at K (%)', fontsize=12, color='red')
+                # Add grid
+                ax.grid(True, linestyle=':', alpha=0.5, zorder=0)
+                color_hex = rgba_to_hex(color)
+                # Write header in color of relation
+                st.markdown(f"<h3 style='color:{color_hex}'>{relation.replace('_', ' ').title()}</h3>", unsafe_allow_html=True)
+                # Show plot, then release the figure so figures do not pile up across reruns
+                st.pyplot(fig)
+                plt.close(fig)
+                # Create recall at K table (single 'Recall' row, one column per K)
+                table_hits = np.searchsorted(sorted_ranks, k_vals, side='right')
+                recall_at_k = [f"{100 * hits / n_known:.2f}%" for hits in table_hits]
+                recall_df = pd.DataFrame([recall_at_k], index=['Recall'],
+                                         columns=[f"k = {k:.0f}" for k in k_vals])
+                st.markdown('**Recall at $k$:**')
+                st.dataframe(recall_df, use_container_width=True)
+                # Compute other statistics
+                st.markdown('**Statistics:**')
+                # True label is 1 for predictions whose ID is a known edge of this relation
+                true_label = predictions['ID'].isin(relation_data['ID']).to_numpy().astype(int)
+                # Compute scores
+                accuracy = accuracy_score(true_label, binary_score)
+                ap = average_precision_score(true_label, raw_score)
+                # Positive-class F1: micro-averaged F1 on binary labels is identical to accuracy,
+                # which made the F1 column a duplicate of the Accuracy column.
+                f1 = f1_score(true_label, binary_score)
+                try:
+                    auc = roc_auc_score(true_label, raw_score)
+                except ValueError:
+                    # roc_auc_score raised (e.g. only one class present); report chance level
+                    auc = 0.5
+                # Create dataframe
+                stats_df = pd.DataFrame({'Accuracy': [accuracy], 'AUC': [auc], 'AP': [ap], 'F1': [f1]})
+                stats_df.index = ["Value"]
+                st.dataframe(stats_df, use_container_width=True)
+                # Show the known relationships without the now-redundant relation column
+                st.markdown('**Known Relationships:**')
+                relation_data = relation_data.drop(columns = 'Known Relation')
+                if target_node_type not in ['disease', 'anatomy']:
+                    st.dataframe(relation_data, use_container_width=True,
+                                column_config={"Database": st.column_config.LinkColumn(width = "small",
+                                                                                        help = "Click to visit external database.",
+                                                                                        display_text = st.session_state.display_database)})
+                else:
+                    st.dataframe(relation_data, use_container_width=True)
+    else:
+        st.error('No ground truth relationships found for the given query in the knowledge graph.', icon="✖️")