Spaces:

pdjohn
/

causev

Running

App Files Files Community

pdjohn committed on Oct 27, 2024

Commit

9525dec

1 Parent(s): 4f0f736

Sankey

Browse files

Files changed (3) hide show

app.py +26 -4
data/indicator_cause_sentence_metadata.tsv +0 -0
plot.py +99 -15

app.py CHANGED Viewed

@@ -4,9 +4,13 @@ from transformers import AutoTokenizer, AutoModelForTokenClassification
 from annotated_text import annotated_text
 import pandas as pd
 import plotly.express as px
-from plot import indicator_chart, causes_chart, scatter_plot
 import os
 # Load the trained model and tokenizer
 model_directory = "norygano/causalBERT"
 tokenizer = AutoTokenizer.from_pretrained(model_directory, add_prefix_space=True)
@@ -30,7 +34,7 @@ st.markdown("[Model](https://huggingface.co/norygano/causalBERT) | [Data](https:
 st.write("Tags indicators and causes in explicit attributions of causality. GER only (atm)")
 # Create tabs
-tab1, tab2, tab3, tab4 = st.tabs(["Prompt", "Indicators", "Causes", "Scatter"])
 # Prompt Tab
 with tab1:
@@ -99,5 +103,23 @@ with tab3:
 with tab4:
     st.write("## Scatter")
-    fig_scatter = scatter_plot()
-    st.plotly_chart(fig_scatter, use_container_width=True)

 from annotated_text import annotated_text
 import pandas as pd
 import plotly.express as px
+from plot import indicator_chart, causes_chart, scatter, sankey
 import os
+# Define initial threshold values at the top of the script
+default_cause_threshold = 20
+default_indicator_threshold = 3
 # Load the trained model and tokenizer
 model_directory = "norygano/causalBERT"
 tokenizer = AutoTokenizer.from_pretrained(model_directory, add_prefix_space=True)
 st.write("Tags indicators and causes in explicit attributions of causality. GER only (atm)")
 # Create tabs
+tab1, tab2, tab3, tab4, tab5 = st.tabs(["Prompt", "Indicators", "Causes", "Scatter", "Sankey"])
 # Prompt Tab
 with tab1:
 with tab4:
     st.write("## Scatter")
+    fig_scatter = scatter()
+    st.plotly_chart(fig_scatter, use_container_width=True)
+with tab5:
+    st.write("## Sankey")
+    # Fixed height for the Sankey chart container
+    with st.container():
+        # Retrieve slider values and generate the diagram
+        cause_threshold = st.session_state.get("cause_threshold", default_cause_threshold)
+        indicator_threshold = st.session_state.get("indicator_threshold", default_indicator_threshold)
+        fig_sankey = sankey(cause_threshold=cause_threshold, indicator_threshold=indicator_threshold)
+        st.plotly_chart(fig_sankey, use_container_width=True)
+    # Place sliders below the chart container
+    with st.container():
+        st.write("Adjust thresholds for Sankey diagram:")
+        cause_threshold = st.slider("Cause Threshold", min_value=1, max_value=100, value=default_cause_threshold, key="cause_threshold")
+        indicator_threshold = st.slider("Indicator Threshold", min_value=1, max_value=100, value=default_indicator_threshold, key="indicator_threshold")

data/indicator_cause_sentence_metadata.tsv CHANGED Viewed

The diff for this file is too large to render. See raw diff

plot.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import pandas as pd
 import plotly.express as px
 import os
 import umap
@@ -64,7 +65,7 @@ def indicator_chart(chart_type='overall'):
     fig.update_layout(
         xaxis=dict(showline=True),
-        yaxis=dict(showticklabels=True, title=''),
         bargap=0.05,
         showlegend=(chart_type == 'individual')
     )
@@ -107,17 +108,17 @@ def causes_chart():
     return fig
-def scatter_plot(include_modality=False):
     data_file = os.path.join('data', 'feature_matrix.tsv')
     df = pd.read_csv(data_file, sep='\t')
-    # Exclude sentences without any indicators (all indicator columns are 0), causes, or modalities (if included)
     indicator_columns = [col for col in df.columns if col.startswith('indicator_')]
     cause_columns = [col for col in df.columns if col.startswith('cause_')]
     modality_columns = [col for col in df.columns if col.startswith('modality_')]
     df_filtered = df[(df[indicator_columns].sum(axis=1) > 0) |
-                        (df[cause_columns].sum(axis=1) > 0)]
     # Exclude indicator '!besprechen'
     indicator_columns = [col for col in indicator_columns if 'indicator_!besprechen' not in col]
@@ -129,30 +130,26 @@ def scatter_plot(include_modality=False):
     # Further filter to exclude entries without any valid indicators
     df_filtered = df_filtered[df_filtered[indicators_to_keep].sum(axis=1) > 0]
-    # Exclude non-feature columns (metadata and sentence text) for dimensionality reduction
     columns_to_drop = ['subfolder']
     if not include_modality:
         columns_to_drop += modality_columns  # Drop modality columns if not included
     features = df_filtered.drop(columns=columns_to_drop)
-    # Fill NaN values with 0 for the feature matrix
     features_clean = features.fillna(0)
-    # Store the relevant metadata separately to ensure it is aligned correctly with the dimensionality reduction results
     metadata = df_filtered[['subfolder']].copy()
-    # Remove the 'indicator_' prefix for indicators and ensure only indicators with at least 10 occurrences are included
     metadata['indicator'] = df_filtered[indicators_to_keep].apply(lambda row: ', '.join([indicator.replace('indicator_', '') for indicator in indicators_to_keep if row[indicator] > 0]), axis=1)
-    # Collect all non-zero causes as a string (multiple causes per sentence)
     metadata['cause'] = df_filtered[cause_columns].apply(lambda row: ', '.join([cause.replace('cause_', '') for cause in cause_columns if row[cause] > 0]), axis=1)
-    # Perform UMAP dimensionality reduction
-    reducer = umap.UMAP(n_components=2, random_state=42, n_neighbors=50, metric='cosine')
     reduced_features = reducer.fit_transform(features_clean)
     df_reduced = pd.DataFrame(reduced_features, columns=['Component 1', 'Component 2'])
     df_reduced = pd.concat([df_reduced, metadata.reset_index(drop=True)], axis=1)
-    # Plotting the scatter plot with Plotly Express
     hover_data = {'cause'}
     if include_modality:
         hover_data['Modality'] = True
@@ -161,17 +158,104 @@ def scatter_plot(include_modality=False):
         df_reduced,
         x='Component 1',
         y='Component 2',
-        color='subfolder',
         hover_data=hover_data,
         labels={'Component 1': 'UMAP Dim 1', 'Component 2': 'UMAP Dim 2'},
         color_discrete_sequence=px.colors.qualitative.D3
     )
     fig.update_layout(
         xaxis=dict(showgrid=False),
         yaxis=dict(showgrid=False),
-        showlegend=True
     )
     return fig

 import pandas as pd
 import plotly.express as px
+import plotly.graph_objects as go
 import os
 import umap
     fig.update_layout(
         xaxis=dict(showline=True),
+        yaxis=dict(showticklabels=True, title='', tickformat=".0%" if chart_type == 'overall' else None),
         bargap=0.05,
         showlegend=(chart_type == 'individual')
     )
     return fig
+def scatter(include_modality=False):
     data_file = os.path.join('data', 'feature_matrix.tsv')
     df = pd.read_csv(data_file, sep='\t')
+    # Exclude sentences without any indicators, causes, or modalities (if included)
     indicator_columns = [col for col in df.columns if col.startswith('indicator_')]
     cause_columns = [col for col in df.columns if col.startswith('cause_')]
     modality_columns = [col for col in df.columns if col.startswith('modality_')]
     df_filtered = df[(df[indicator_columns].sum(axis=1) > 0) |
+                     (df[cause_columns].sum(axis=1) > 0)]
     # Exclude indicator '!besprechen'
     indicator_columns = [col for col in indicator_columns if 'indicator_!besprechen' not in col]
     # Further filter to exclude entries without any valid indicators
     df_filtered = df_filtered[df_filtered[indicators_to_keep].sum(axis=1) > 0]
+    # Exclude non-feature columns for dimensionality reduction
     columns_to_drop = ['subfolder']
     if not include_modality:
         columns_to_drop += modality_columns  # Drop modality columns if not included
     features = df_filtered.drop(columns=columns_to_drop)
     features_clean = features.fillna(0)
+    # Prepare metadata
     metadata = df_filtered[['subfolder']].copy()
     metadata['indicator'] = df_filtered[indicators_to_keep].apply(lambda row: ', '.join([indicator.replace('indicator_', '') for indicator in indicators_to_keep if row[indicator] > 0]), axis=1)
     metadata['cause'] = df_filtered[cause_columns].apply(lambda row: ', '.join([cause.replace('cause_', '') for cause in cause_columns if row[cause] > 0]), axis=1)
+    # UMAP dimensionality reduction
+    reducer = umap.UMAP(n_components=2, random_state=42, n_neighbors=50, n_jobs=1, metric='cosine')
     reduced_features = reducer.fit_transform(features_clean)
     df_reduced = pd.DataFrame(reduced_features, columns=['Component 1', 'Component 2'])
     df_reduced = pd.concat([df_reduced, metadata.reset_index(drop=True)], axis=1)
+    # Plotting the scatter plot
     hover_data = {'cause'}
     if include_modality:
         hover_data['Modality'] = True
         df_reduced,
         x='Component 1',
         y='Component 2',
+        color='subfolder',  # Only subfolder colors will show in the legend
+        symbol='indicator',  # Symbols for indicators, without showing in legend
         hover_data=hover_data,
         labels={'Component 1': 'UMAP Dim 1', 'Component 2': 'UMAP Dim 2'},
         color_discrete_sequence=px.colors.qualitative.D3
     )
+    # Hide the legend for all symbol traces (indicator-based traces)
+    for trace in fig.data:
+        if trace.marker.symbol is not None:  # This targets symbol traces
+            trace.showlegend = False
     fig.update_layout(
         xaxis=dict(showgrid=False),
         yaxis=dict(showgrid=False),
+        showlegend=True,  # Show only the subfolder legend
+        legend=dict(
+            title="Term",  # Adjust title to indicate the subfolder legend
+            yanchor="top",
+            xanchor="left",
+            borderwidth=1,
+        ),
     )
     return fig
def _build_sankey_links(df, cause_threshold, indicator_threshold):
    """Compute Sankey node labels and links from a cause/indicator/subfolder table.

    Args:
        df: DataFrame with 'cause', 'indicator' and 'subfolder' columns.
        cause_threshold: Minimum number of rows a cause must occur in to be kept.
        indicator_threshold: Minimum number of rows an indicator must occur in
            to be kept.

    Returns:
        A tuple ``(labels, sources, targets, values)``. ``labels`` lists the
        node labels (causes first, then indicators, then subfolders, each in
        order of first appearance); ``sources``/``targets`` are node indices
        into ``labels`` and ``values`` the row counts for each link. The
        cause -> indicator links come first, followed by the
        indicator -> subfolder links.
    """
    # Rows missing any of the three columns cannot form a complete path.
    df = df.dropna(subset=['cause', 'indicator', 'subfolder']).copy()
    # Strip the literal '_nk' marker; regex=False keeps this independent of
    # the pandas version's default for the `regex` argument.
    df['subfolder'] = df['subfolder'].str.replace('_nk', '', regex=False)

    # Thresholds apply to the overall frequency of each cause / indicator.
    cause_counts = df['cause'].value_counts()
    indicator_counts = df['indicator'].value_counts()
    frequent_causes = cause_counts.index[cause_counts >= cause_threshold]
    frequent_indicators = indicator_counts.index[indicator_counts >= indicator_threshold]
    kept = df[df['cause'].isin(frequent_causes) & df['indicator'].isin(frequent_indicators)]

    # Key nodes by (column, label) rather than by label alone: the same string
    # may occur in more than one column, and a label-only mapping would merge
    # those nodes and attach links to the wrong one.
    labels = []
    node_index = {}
    for column in ('cause', 'indicator', 'subfolder'):
        for label in kept[column].unique():
            node_index[(column, label)] = len(labels)
            labels.append(label)

    sources, targets, values = [], [], []
    for src_col, dst_col in (('cause', 'indicator'), ('indicator', 'subfolder')):
        pair_counts = kept.groupby([src_col, dst_col]).size()
        for (src, dst), count in pair_counts.items():
            sources.append(node_index[(src_col, src)])
            targets.append(node_index[(dst_col, dst)])
            values.append(int(count))

    return labels, sources, targets, values


def sankey(cause_threshold=10, indicator_threshold=5):
    """Build a cause -> indicator -> subfolder Sankey diagram.

    Reads 'data/indicator_cause_sentence_metadata.tsv' (tab-separated), keeps
    only causes and indicators whose overall row counts reach the given
    thresholds, and draws the flows between the remaining values.

    Args:
        cause_threshold: Minimum number of rows for a cause to be shown.
        indicator_threshold: Minimum number of rows for an indicator to be shown.

    Returns:
        A plotly ``Figure`` containing a single Sankey trace, laid out at a
        fixed 500x500 size.
    """
    data_file = os.path.join('data', 'indicator_cause_sentence_metadata.tsv')
    df = pd.read_csv(data_file, sep='\t')

    labels, sources, targets, values = _build_sankey_links(
        df, cause_threshold, indicator_threshold
    )

    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=labels,
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values,
        ),
    )])

    fig.update_layout(
        autosize=False,   # Disable automatic resizing
        width=500,        # Fixed width
        height=500,       # Fixed height
    )

    return fig