Spaces:

norygano
/

causev

Running

App Files Files Community

norygano committed on Oct 28, 2024

Commit

7e996cc

1 Parent(s): 6b22889

Added plot.py

Browse files

Files changed (3) hide show

app.py +65 -29
data/feature_matrix.tsv +0 -0
plot.py +265 -238

app.py CHANGED Viewed

@@ -2,14 +2,17 @@ import streamlit as st
 import torch
 from transformers import AutoTokenizer, AutoModelForTokenClassification
 from annotated_text import annotated_text
-import pandas as pd
-import plotly.express as px
-from plot import indicator_chart, causes_chart, scatter, sankey
 import os
 # Define initial threshold values at the top of the script
 default_cause_threshold = 20
 default_indicator_threshold = 15
 # Load the trained model and tokenizer
 model_directory = "norygano/causalBERT"
@@ -30,16 +33,16 @@ st.markdown(
     """,
     unsafe_allow_html=True
 )
-st.markdown("[Model](https://huggingface.co/norygano/causalBERT) | [Data](https://huggingface.co/datasets/norygano/causenv) | [Project](https://www.uni-trier.de/universitaet/fachbereiche-faecher/fachbereich-ii/faecher/germanistik/professurenfachteile/germanistische-linguistik/professoren/prof-dr-martin-wengeler/kontroverse-diskurse/individium-gesellschaft)")
-st.write("Tags indicators and causes in explicit attributions of causality.")
 # Create tabs
-tab1, tab2, tab3, tab4, tab5 = st.tabs(["Prompt", "Indicators", "Causes", "Scatter", "Sankey"])
 # Prompt Tab
 with tab1:
     sentences_input = st.text_area("*Sentences (one per line)*", "\n".join([
-        "Autos stehen im Verdacht, Waldsterben zu verursachen.",
         "Fußball führt zu Waldschäden.",
         "Haustüren tragen zum Betonsterben bei.",
     ]), placeholder="German only (currently)")
@@ -82,36 +85,69 @@ with tab1:
             annotated_text(*annotations)
             st.write("---")
-# Research Insights Tab
 with tab2:
-    # Overall
-    st.subheader("Overall")
-    fig_overall = indicator_chart(chart_type='overall')
-    st.plotly_chart(fig_overall, use_container_width=True)
-    # Individual Indicators Chart
-    st.subheader("Individual")
-    fig_individual = indicator_chart(chart_type='individual')
-    st.plotly_chart(fig_individual, use_container_width=True)
 with tab3:
-    fig_causes = causes_chart()
-    st.plotly_chart(fig_causes, use_container_width=True)
 with tab4:
-    fig_scatter = scatter()
     st.plotly_chart(fig_scatter, use_container_width=True)
 with tab5:
-    # Fixed height for the Sankey chart container
     with st.container():
-        # Retrieve slider values and generate the diagram
-        cause_threshold = st.session_state.get("cause_threshold", default_cause_threshold)
-        indicator_threshold = st.session_state.get("indicator_threshold", default_indicator_threshold)
-        fig_sankey = sankey(cause_threshold=cause_threshold, indicator_threshold=indicator_threshold)
         st.plotly_chart(fig_sankey, use_container_width=True)
-    # Place sliders below the chart container
     with st.container():
-        cause_threshold = st.slider("Cause >", min_value=1, max_value=100, value=default_cause_threshold, key="cause_threshold")
-        indicator_threshold = st.slider("Indicator >", min_value=1, max_value=100, value=default_indicator_threshold, key="indicator_threshold")

 import torch
 from transformers import AutoTokenizer, AutoModelForTokenClassification
 from annotated_text import annotated_text
 import os
+from plot import Plot  # Plot class is defined in plot.py
 # Define initial threshold values at the top of the script
 default_cause_threshold = 20
 default_indicator_threshold = 15
+default_cause_threshold_sankey = 20
+default_indicator_threshold_sankey = 15
+# Initialize Plots
+plot = Plot()
 # Load the trained model and tokenizer
 model_directory = "norygano/causalBERT"
     """,
     unsafe_allow_html=True
 )
+st.markdown("[Weights](https://huggingface.co/norygano/causalBERT) | [Data](https://huggingface.co/datasets/norygano/causenv) | [Project](https://www.uni-trier.de/universitaet/fachbereiche-faecher/fachbereich-ii/faecher/germanistik/professurenfachteile/germanistische-linguistik/professoren/prof-dr-martin-wengeler/kontroverse-diskurse/individium-gesellschaft)")
+st.write("Indicators and causes in explicit attributions of causality.")
 # Create tabs
+tab1, tab2, tab3, tab4, tab5 = st.tabs(["Model", "Indicators", "Causes", "Scatter", "Sankey"])
 # Model Tab
 with tab1:
     sentences_input = st.text_area("*Sentences (one per line)*", "\n".join([
+        "Autos stehen im Verdacht, Waldsterben verursacht zu haben.",
         "Fußball führt zu Waldschäden.",
         "Haustüren tragen zum Betonsterben bei.",
     ]), placeholder="German only (currently)")
             annotated_text(*annotations)
             st.write("---")
+# Indicator Tab
 with tab2:
+    selected_chart_type = st.radio(
+        "Type",
+        options=['Total', 'Year', 'Individual'],
+        horizontal=True,
+    )
+    # Display the chart in a container
+    with st.container():
+        if selected_chart_type == 'Individual':
+            # Retrieve slider value from session state or use default
+            individual_threshold = st.session_state.get("individual_threshold", default_indicator_threshold)
+            fig = plot.get_indicator_chart(chart_type=selected_chart_type.lower(), individual_threshold=individual_threshold)
+        else:
+            fig = plot.get_indicator_chart(chart_type=selected_chart_type.lower())
+        st.plotly_chart(fig, use_container_width=True)
+    # Display the slider below the chart container for 'Individual' type
+    if selected_chart_type == 'Individual':
+        with st.container():
+            individual_threshold = st.slider(
+                "Indicator >=",
+                min_value=1,
+                max_value=95,
+                value=default_indicator_threshold,
+                key="individual_threshold"
+            )
+# Causes Tab
 with tab3:
+    # Create a container for the chart and place the slider below it
+    with st.container():
+        # Display the chart first
+        fig_causes = plot.get_causes_chart(min_value=st.session_state.get("cause_threshold_causes", default_cause_threshold))
+        st.plotly_chart(fig_causes, use_container_width=True)
+        # Place the slider below the chart with a unique key
+        cause_threshold_causes = st.slider(
+            "Cause >=", min_value=1, max_value=75, value=default_cause_threshold, key="cause_threshold_causes"
+        )
+# Scatter Tab
 with tab4:
+    fig_scatter = plot.scatter()
     st.plotly_chart(fig_scatter, use_container_width=True)
+# Sankey Tab
 with tab5:
     with st.container():
+        # Use the unique Sankey threshold variables in session state
+        cause_threshold_sankey = st.session_state.get("cause_threshold_sankey", default_cause_threshold_sankey)
+        indicator_threshold_sankey = st.session_state.get("indicator_threshold_sankey", default_indicator_threshold_sankey)
+        # Generate the Sankey diagram with the new Sankey-specific thresholds
+        fig_sankey = plot.sankey(cause_threshold=cause_threshold_sankey, indicator_threshold=indicator_threshold_sankey)
         st.plotly_chart(fig_sankey, use_container_width=True)
+    # Place sliders below the chart container with unique keys for the Sankey tab
     with st.container():
+        cause_threshold_sankey = st.slider(
+            "Cause >=", min_value=1, max_value=100, value=default_cause_threshold_sankey, key="cause_threshold_sankey"
+        )
+        indicator_threshold_sankey = st.slider(
+            "Indicator >=", min_value=1, max_value=100, value=default_indicator_threshold_sankey, key="indicator_threshold_sankey"
+        )

data/feature_matrix.tsv CHANGED Viewed

The diff for this file is too large to render. See raw diff

plot.py CHANGED Viewed

@@ -3,259 +3,286 @@ import plotly.express as px
 import plotly.graph_objects as go
 import os
 import umap
-def indicator_chart(chart_type='overall'):
-    data_file = os.path.join('data', 'indicator_overview.tsv')
-    df = pd.read_csv(data_file, sep='\t')
-    if chart_type == 'overall':
-        df_filtered = df[df['Indicator'] == 'Total with Indicators'].copy()
-        total_sentences_per_subfolder = df.groupby('Subfolder')['Total Sentences'].first().to_dict()
-        df_filtered['Total Sentences'] = df_filtered['Subfolder'].map(total_sentences_per_subfolder)
-        df_filtered['Indicator_Share'] = df_filtered['Count'] / df_filtered['Total Sentences']
-        df_filtered['Indicator_Share_Text'] = (df_filtered['Indicator_Share'] * 100).round(2).astype(str) + '%'
-        fig = px.bar(
-            df_filtered,
-            x='Subfolder',
-            y='Indicator_Share',
-            labels={'Indicator_Share': 'Share of Sentences with Indicators', 'Subfolder': ''},
-            color='Subfolder',
-            text='Indicator_Share_Text',
-            color_discrete_sequence=px.colors.qualitative.D3,
-            custom_data=['Total Sentences', 'Count']
-        )
-        fig.update_traces(
-            hovertemplate=(
-                '<b>%{x}</b><br>' +
-                'Share with Indicators: %{y:.1%}<br>' +
-                'Total Sentences: %{customdata[0]}<br>' +
-                'Sentences with Indicators: %{customdata[1]}<extra></extra>'
-            ),
-            textposition='inside',
-            texttemplate='%{text}',
-            textfont=dict(color='rgb(255, 255, 255)'),
-            insidetextanchor='middle',
         )
-    elif chart_type == 'individual':
-        min_value = 5
-        exclude_indicators = ['!besprechen']
-        df_filtered = df[~df['Indicator'].isin(['Total with Indicators', 'None'] + exclude_indicators)].copy()
-        indicators_meeting_threshold = df_filtered[df_filtered['Count'] >= min_value]['Indicator'].unique()
-        df_filtered = df_filtered[df_filtered['Indicator'].isin(indicators_meeting_threshold)]
-        df_filtered['Indicator'] = df_filtered['Indicator'].str.capitalize()
         fig = px.bar(
-            df_filtered,
-            x='Subfolder',
             y='Count',
-            color='Indicator',
             barmode='group',
-            labels={'Count': 'Occurrences', 'Subfolder': '', 'Indicator': '  <b>INDICATOR</b>'},
             color_discrete_sequence=px.colors.qualitative.D3
         )
         fig.update_traces(
             texttemplate='%{y}',
             textposition='inside',
-            textfont=dict(color='rgb(255, 255, 255)'),
-            insidetextanchor='middle'
         )
-    fig.update_layout(
-        xaxis=dict(showline=True),
-        yaxis=dict(showticklabels=True, title='', tickformat=".0%" if chart_type == 'overall' else None),
-        bargap=0.05,
-        showlegend=(chart_type == 'individual')
-    )
-    return fig
-def causes_chart():
-    data_file = os.path.join('data', 'indicator_cause_sentence_metadata.tsv')
-    df = pd.read_csv(data_file, sep='\t')
-    # Threshold
-    min_value = 30
-    df_filtered = df[df['cause'] != 'N/A'].copy()
-    causes_meeting_threshold = df_filtered.groupby('cause')['cause'].count()[lambda x: x >= min_value].index
-    df_filtered = df_filtered[df_filtered['cause'].isin(causes_meeting_threshold)]
-    df_filtered['cause'] = df_filtered['cause'].str.capitalize()
-    fig = px.bar(
-        df_filtered.groupby(['subfolder', 'cause']).size().reset_index(name='Count'),
-        x='subfolder',
-        y='Count',
-        color='cause',
-        barmode='group',
-        labels={'Count': 'Occurrences', 'subfolder': '', 'cause': '<b>CAUSE</b>'},
-        color_discrete_sequence=px.colors.qualitative.D3,
-    )
-    fig.update_layout(
-        xaxis=dict(showline=True),
-        yaxis=dict(showticklabels=True, title=''),
-    )
-    fig.update_traces(
-        texttemplate='%{y}',
-        textposition='inside',
-        textfont=dict(color='rgb(255, 255, 255)'),
-        insidetextanchor='middle',
-    )
-    return fig
-def scatter(include_modality=False):
-    data_file = os.path.join('data', 'feature_matrix.tsv')
-    df = pd.read_csv(data_file, sep='\t')
-    # Exclude sentences without any indicators, causes, or modalities (if included)
-    indicator_columns = [col for col in df.columns if col.startswith('indicator_')]
-    cause_columns = [col for col in df.columns if col.startswith('cause_')]
-    modality_columns = [col for col in df.columns if col.startswith('modality_')]
-    df_filtered = df[(df[indicator_columns].sum(axis=1) > 0) |
-                     (df[cause_columns].sum(axis=1) > 0)]
-    # Exclude indicator '!besprechen'
-    indicator_columns = [col for col in indicator_columns if 'indicator_!besprechen' not in col]
-    # Limit indicators to those that occur at least 10 times
-    indicator_counts = df_filtered[indicator_columns].sum()
-    indicators_to_keep = indicator_counts[indicator_counts >= 10].index.tolist()
-    # Further filter to exclude entries without any valid indicators
-    df_filtered = df_filtered[df_filtered[indicators_to_keep].sum(axis=1) > 0]
-    # Exclude non-feature columns for dimensionality reduction
-    columns_to_drop = ['subfolder']
-    if not include_modality:
-        columns_to_drop += modality_columns  # Drop modality columns if not included
-    features = df_filtered.drop(columns=columns_to_drop)
-    features_clean = features.fillna(0)
-    # Prepare metadata
-    metadata = df_filtered[['subfolder']].copy()
-    metadata['indicator'] = df_filtered[indicators_to_keep].apply(lambda row: ', '.join([indicator.replace('indicator_', '') for indicator in indicators_to_keep if row[indicator] > 0]), axis=1)
-    metadata['cause'] = df_filtered[cause_columns].apply(lambda row: ', '.join([cause.replace('cause_', '') for cause in cause_columns if row[cause] > 0]), axis=1)
-    # UMAP dimensionality reduction
-    reducer = umap.UMAP(n_components=2, random_state=42, n_neighbors=50, n_jobs=1, metric='cosine')
-    reduced_features = reducer.fit_transform(features_clean)
-    df_reduced = pd.DataFrame(reduced_features, columns=['Component 1', 'Component 2'])
-    df_reduced = pd.concat([df_reduced, metadata.reset_index(drop=True)], axis=1)
-    # Plotting the scatter plot
-    hover_data = {'cause': True, 'Component 1': False, 'Component 2': False}
-    if include_modality:
-        hover_data['Modality'] = True
-    custom_labels = {
-    'subfolder': 'Effect',      # Renaming 'subfolder' to 'Category'
-    }
-    fig = px.scatter(
-        df_reduced,
-        x='Component 1',
-        y='Component 2',
-        color='subfolder',  # Only subfolder colors will show in the legend
-        symbol='indicator',  # Symbols for indicators, without showing in legend
-        labels=custom_labels,
-        hover_data=hover_data,
-        color_discrete_sequence=px.colors.qualitative.D3
-    )
-    fig.update_layout(
-        xaxis=dict(showgrid=True),
-        yaxis=dict(showgrid=True),
-        showlegend=True,  # Show only the subfolder legend
-        legend=dict(
-            title="Effect, Indicator",  # Adjust title to indicate the subfolder legend
-            yanchor="top",
-            xanchor="left",
-            borderwidth=1,
-        ),
-    )
-    return fig
-def sankey(cause_threshold=10, indicator_threshold=5):
-    # Load the data
-    data_file = os.path.join('data', 'indicator_cause_sentence_metadata.tsv')
-    df = pd.read_csv(data_file, sep='\t')
-    # Remove rows with NaN values in 'cause', 'indicator', or 'subfolder' columns
-    df = df.dropna(subset=['cause', 'indicator', 'subfolder'])
-    # Strip '_nk' from 'subfolder' values
-    df['subfolder'] = df['subfolder'].str.replace('_nk', '')
-    # Calculate overall counts for each cause and indicator
-    cause_counts = df['cause'].value_counts()
-    indicator_counts = df['indicator'].value_counts()
-    # Filter causes and indicators that meet their respective thresholds
-    valid_causes = cause_counts[cause_counts >= cause_threshold].index
-    valid_indicators = indicator_counts[indicator_counts >= indicator_threshold].index
-    # Filter the DataFrame to include only rows with causes and indicators that meet the thresholds
-    df_filtered = df[(df['cause'].isin(valid_causes)) & (df['indicator'].isin(valid_indicators))]
-    # Calculate pair counts for cause -> indicator and indicator -> subfolder
-    cause_indicator_counts = df_filtered.groupby(['cause', 'indicator']).size().reset_index(name='count')
-    indicator_subfolder_counts = df_filtered.groupby(['indicator', 'subfolder']).size().reset_index(name='count')
-    # Generate unique labels for Sankey nodes, including all causes, indicators, and subfolders
-    causes = df_filtered['cause'].unique()
-    indicators = df_filtered['indicator'].unique()
-    subfolders = df_filtered['subfolder'].unique()
-    all_labels = list(causes) + list(indicators) + list(subfolders)
-    # Mapping of each label to an index for Sankey node
-    label_to_index = {label: idx for idx, label in enumerate(all_labels)}
-    # Define sources, targets, and values for the Sankey diagram
-    sources = []
-    targets = []
-    values = []
-    # Add cause -> indicator links
-    for _, row in cause_indicator_counts.iterrows():
-        if row['cause'] in label_to_index and row['indicator'] in label_to_index:
-            sources.append(label_to_index[row['cause']])
-            targets.append(label_to_index[row['indicator']])
-            values.append(row['count'])
-    # Add indicator -> subfolder links
-    for _, row in indicator_subfolder_counts.iterrows():
-        if row['indicator'] in label_to_index and row['subfolder'] in label_to_index:
-            sources.append(label_to_index[row['indicator']])
-            targets.append(label_to_index[row['subfolder']])
-            values.append(row['count'])
-    fig = go.Figure(data=[go.Sankey(
-        node=dict(
-            pad=15,
-            thickness=20,
-            line=dict(color="black", width=0.5),
-            label=all_labels,
-        ),
-        link=dict(
-            source=sources,
-            target=targets,
-            value=values
         )
-    )])
-    fig.update_layout(
-        autosize=False,   # Disable automatic resizing
-        width=500,        # Fixed width
-        height=500,       # Fixed height
-    )
-    return fig

 import plotly.graph_objects as go
 import os
 import umap
+import streamlit as st
+@st.cache_data
+def load_data(file_path):
+    return pd.read_csv(file_path, sep='\t')
+class Plot:
+    def __init__(self, data_file='data/feature_matrix.tsv', metadata_file='data/indicator_cause_sentence_metadata.tsv'):
+        self.data_file = data_file
+        self.metadata_file = metadata_file
+        self.df = load_data(self.data_file)  # Cached data loading
+        self.metadata_df = load_data(self.metadata_file)
+        # Cache and compute necessary columns once
+        self.indicator_columns = [col for col in self.df.columns if col.startswith('indicator_')]
+        self.cause_columns = [col for col in self.df.columns if col.startswith('cause_')]
+        self.df['Year'] = self.df['text_date'].astype(str).str[:4]
+        self.df['Has_Indicator'] = self.df[self.indicator_columns].sum(axis=1) > 0
+        # Precompute totals for faster use in chart functions
+        self.total_sentences_per_year = self.df.groupby(['Year', 'subfolder']).size().reset_index(name='Total Sentences')
+        self.total_sentences_per_subfolder = self.df.groupby('subfolder').size().reset_index(name='Total Sentences')
+    def get_indicator_chart(self, chart_type='total', individual_threshold=5):
+        if chart_type == 'total':
+            # Summarize indicator share per subfolder
+            indicator_counts = self.df[self.df['Has_Indicator']].groupby('subfolder').size().reset_index(name='Indicator Count')
+            total_counts = indicator_counts.merge(self.total_sentences_per_subfolder, on='subfolder')
+            total_counts['Indicator_Share'] = total_counts['Indicator Count'] / total_counts['Total Sentences']
+            total_counts['Indicator_Share_Text'] = (total_counts['Indicator_Share'] * 100).round(2).astype(str) + '%'
+            fig = px.bar(
+                total_counts,
+                x='subfolder',
+                y='Indicator_Share',
+                labels={'Indicator_Share': 'Share of Sentences with Indicators', 'subfolder': ''},
+                color='subfolder',
+                text='Indicator_Share_Text',
+                color_discrete_sequence=px.colors.qualitative.D3
+            )
+            fig.update_traces(
+                textposition='inside',
+                texttemplate='%{text}',
+                textfont=dict(color='rgb(255, 255, 255)')
+            )
+        elif chart_type == 'individual':
+            # Melt the dataframe to long format
+            df_melted = self.df.melt(id_vars=['subfolder'], value_vars=self.indicator_columns, var_name='Indicator', value_name='Count')
+            df_melted = df_melted[df_melted['Count'] > 0]
+            # Group by Indicator only to calculate total counts across all subfolders
+            total_indicator_counts = df_melted.groupby('Indicator').size().reset_index(name='Total Count')
+            indicators_meeting_threshold = total_indicator_counts[total_indicator_counts['Total Count'] >= individual_threshold]['Indicator'].unique()
+            # Filter df_melted to include only indicators that meet the threshold overall
+            df_melted = df_melted[df_melted['Indicator'].isin(indicators_meeting_threshold)]
+            df_melted['Indicator'] = df_melted['Indicator'].str.replace('indicator_', '').str.capitalize()
+            # Re-aggregate counts by subfolder and indicator for the filtered indicators
+            df_melted = df_melted.groupby(['subfolder', 'Indicator']).size().reset_index(name='Count')
+            # Create the bar chart
+            fig = px.bar(
+                df_melted,
+                x='subfolder',
+                y='Count',
+                color='Indicator',
+                barmode='group',
+                labels={'Count': 'Occurrences', 'subfolder': '', 'Indicator': 'Indicator'},
+                color_discrete_sequence=px.colors.qualitative.D3
+            )
+            fig.update_traces(
+                texttemplate='%{y}',
+                textposition='inside',
+                textfont=dict(color='rgb(255, 255, 255)')
+            )
+        elif chart_type == 'year':
+            indicator_counts_per_year = self.df[self.df['Has_Indicator']].groupby(['Year', 'subfolder']).size().reset_index(name='Indicator Count')
+            df_summary = pd.merge(self.total_sentences_per_year, indicator_counts_per_year, on=['Year', 'subfolder'], how='left')
+            df_summary['Indicator_Share_Text'] = (df_summary['Indicator Count'] / df_summary['Total Sentences'] * 100).round(2).astype(str) + '%'
+            fig = px.bar(
+                df_summary,
+                x='Year',
+                y='Total Sentences',
+                color='subfolder',
+                labels={'Total Sentences': 'Total Number of Sentences', 'Year': 'Year'},
+                text='Indicator_Share_Text',
+                color_discrete_sequence=px.colors.qualitative.D3
+            )
+            fig.update_traces(
+                textposition='inside',
+                texttemplate='%{text}',
+                textfont=dict(color='rgb(255, 255, 255)')
+            )
+        fig.update_layout(
+            xaxis=dict(showline=True),
+            yaxis=dict(title='Indicator Sentences' if chart_type != 'year' else 'Total Sentences'),
+            bargap=0.05,
+            showlegend=(chart_type != 'total')
         )
+        return fig
+    def get_causes_chart(self, min_value=30):
+        df_filtered = self.metadata_df[self.metadata_df['cause'] != 'N/A']
+        causes_meeting_threshold = df_filtered.groupby('cause')['cause'].count()[lambda x: x >= min_value].index
+        df_filtered = df_filtered[df_filtered['cause'].isin(causes_meeting_threshold)]
+        df_filtered['cause'] = df_filtered['cause'].str.capitalize()
         fig = px.bar(
+            df_filtered.groupby(['subfolder', 'cause']).size().reset_index(name='Count'),
+            x='subfolder',
             y='Count',
+            color='cause',
             barmode='group',
+            labels={'Count': 'Occurrences', 'subfolder': '', 'cause': 'Cause'},
             color_discrete_sequence=px.colors.qualitative.D3
         )
+        fig.update_layout(xaxis=dict(showline=True), yaxis=dict(showticklabels=True, title=''))
         fig.update_traces(
             texttemplate='%{y}',
             textposition='inside',
+            textfont=dict(color='rgb(255, 255, 255)')
+        )
+        return fig
+    def scatter(self, include_modality=False):
+        # Use self.df to avoid reloading data
+        df_filtered = self.df[(self.df[self.indicator_columns].sum(axis=1) > 0) |
+                              (self.df[self.cause_columns].sum(axis=1) > 0)]
+        # Exclude specific indicators and filter based on count threshold
+        indicator_columns = [col for col in self.indicator_columns if 'indicator_!besprechen' not in col]
+        indicator_counts = df_filtered[indicator_columns].sum()
+        indicators_to_keep = indicator_counts[indicator_counts >= 10].index.tolist()
+        df_filtered = df_filtered[df_filtered[indicators_to_keep].sum(axis=1) > 0]
+        # Exclude non-feature columns for dimensionality reduction
+        columns_to_drop = ['subfolder', 'text_id', 'sentence_id', 'text_date', 'text_source', 'text_text_type']
+        if not include_modality:
+            columns_to_drop += [col for col in self.df.columns if col.startswith('modality_')]
+        features = df_filtered.drop(columns=columns_to_drop, errors='ignore').select_dtypes(include=[float, int])
+        features_clean = features.fillna(0)
+        # Prepare metadata for plotting
+        metadata = df_filtered[['subfolder']].copy()
+        metadata['indicator'] = df_filtered[indicators_to_keep].apply(
+            lambda row: ', '.join([indicator.replace('indicator_', '') for indicator in indicators_to_keep if row[indicator] > 0]),
+            axis=1
+        )
+        metadata['cause'] = df_filtered[self.cause_columns].apply(
+            lambda row: ', '.join([cause.replace('cause_', '') for cause in self.cause_columns if row[cause] > 0]),
+            axis=1
+        )
+        # Perform UMAP dimensionality reduction
+        reducer = umap.UMAP(n_components=2, random_state=42, n_neighbors=50, n_jobs=1, metric='cosine')
+        reduced_features = reducer.fit_transform(features_clean)
+        df_reduced = pd.DataFrame(reduced_features, columns=['Component 1', 'Component 2'])
+        df_reduced = pd.concat([df_reduced, metadata.reset_index(drop=True)], axis=1)
+        # Plotting the scatter plot
+        hover_data = {'cause': True, 'Component 1': False, 'Component 2': False}
+        if include_modality:
+            hover_data['Modality'] = True
+        fig = px.scatter(
+            df_reduced,
+            x='Component 1',
+            y='Component 2',
+            color='subfolder',
+            symbol='indicator',
+            labels={'subfolder': 'Effect'},
+            hover_data=hover_data,
+            color_discrete_sequence=px.colors.qualitative.D3
         )
+        fig.update_layout(
+            xaxis=dict(showgrid=True),
+            yaxis=dict(showgrid=True),
+            showlegend=True,
+            legend=dict(title="Effect, Indicator", yanchor="top", xanchor="left", borderwidth=1),
         )
+        return fig
+    def sankey(self, cause_threshold=10, indicator_threshold=5, link_opacity=0.4):
+        # Use self.df to avoid reloading data
+        df_filtered = self.df[(self.df[self.cause_columns].sum(axis=1) > 0) &
+                              (self.df[self.indicator_columns].sum(axis=1) > 0)]
+        # Melt causes and indicators separately, keeping one row per (text_id, cause) and (text_id, indicator) pair
+        cause_data = df_filtered[['text_id', 'subfolder'] + self.cause_columns].melt(
+            id_vars=['text_id', 'subfolder'], var_name='cause', value_name='count'
+        ).query("count > 0").drop_duplicates(['text_id', 'cause'])
+        indicator_data = df_filtered[['text_id', 'subfolder'] + self.indicator_columns].melt(
+            id_vars=['text_id', 'subfolder'], var_name='indicator', value_name='count'
+        ).query("count > 0").drop_duplicates(['text_id', 'indicator'])
+        # Apply threshold filters
+        valid_causes = cause_data['cause'].value_counts()[lambda x: x >= cause_threshold].index
+        valid_indicators = indicator_data['indicator'].value_counts()[lambda x: x >= indicator_threshold].index
+        cause_data = cause_data[cause_data['cause'].isin(valid_causes)]
+        indicator_data = indicator_data[indicator_data['indicator'].isin(valid_indicators)]
+        # Count cause-indicator links by merging cause and indicator data on 'text_id' and 'subfolder'
+        cause_indicator_links = (
+            cause_data.merge(indicator_data, on=['text_id', 'subfolder'])
+            .groupby(['cause', 'indicator']).size().reset_index(name='count')
+        )
+        # Aggregate indicator-subfolder counts
+        indicator_subfolder_links = (
+            indicator_data.groupby(['indicator', 'subfolder']).size().reset_index(name='count')
+        )
+        # Define unique labels and their order
+        all_labels = list(valid_causes) + list(valid_indicators) + self.df['subfolder'].unique().tolist()
+        # Remove prefixes for cleaner labels
+        all_labels_cleaned = [label.replace("cause_", "").replace("indicator_", "") for label in all_labels]
+        label_to_index = {label: idx for idx, label in enumerate(all_labels)}
+        # Define a color palette from Plotly's D3 color sequence
+        color_palette = px.colors.qualitative.D3
+        node_colors = [color_palette[i % len(color_palette)] for i in range(len(all_labels))]
+        # Define sources, targets, values, and link colors with RGBA opacity
+        sources, targets, values, link_colors = [], [], [], []
+        def hex_to_rgba(hex_color, opacity):
+            return f'rgba({int(hex_color[1:3], 16)}, {int(hex_color[3:5], 16)}, {int(hex_color[5:], 16)}, {opacity})'
+        # Cause -> Indicator links
+        for _, row in cause_indicator_links.iterrows():
+            if row['cause'] in label_to_index and row['indicator'] in label_to_index:
+                source_idx = label_to_index[row['cause']]
+                target_idx = label_to_index[row['indicator']]
+                sources.append(source_idx)
+                targets.append(target_idx)
+                values.append(row['count'])
+                link_colors.append(hex_to_rgba(node_colors[source_idx], link_opacity))
+        # Indicator -> Subfolder links
+        for _, row in indicator_subfolder_links.iterrows():
+            if row['indicator'] in label_to_index and row['subfolder'] in label_to_index:
+                source_idx = label_to_index[row['indicator']]
+                target_idx = label_to_index[row['subfolder']]
+                sources.append(source_idx)
+                targets.append(target_idx)
+                values.append(row['count'])
+                link_colors.append(hex_to_rgba(node_colors[source_idx], link_opacity))
+        fig = go.Figure(data=[go.Sankey(
+            node=dict(
+                pad=15,
+                thickness=20,
+                line=dict(color="black", width=0.5),
+                label=all_labels_cleaned,
+                color=node_colors
+            ),
+            link=dict(
+                source=sources,
+                target=targets,
+                value=values,
+                color=link_colors
+            )
+        )])
+        fig.update_layout(
+            autosize=False,
+            width=800,
+            height=600,
+            font=dict(size=10)
+        )
+        return fig