"""Plotly chart builders that read tab-separated files from the data directory."""

import os

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import umap

# Directory (relative to the working directory) that holds the TSV inputs.
DATA_DIR = 'data'


def _read_tsv(filename):
    """Load a tab-separated file from ``DATA_DIR`` into a DataFrame."""
    return pd.read_csv(os.path.join(DATA_DIR, filename), sep='\t')


def _active_labels(frame, columns, prefix):
    """Return, per row, the comma-joined names of ``columns`` whose value is > 0.

    ``prefix`` is removed from each column name. Missing values compare as
    not greater than zero and are therefore skipped.
    """
    names = [column.replace(prefix, '') for column in columns]
    return [
        ', '.join(name for name, value in zip(names, row) if value > 0)
        for row in frame[columns].to_numpy()
    ]


def indicator_chart(chart_type='overall', min_value=5):
    """Build a bar chart from ``indicator_overview.tsv``.

    Args:
        chart_type: ``'overall'`` plots, per subfolder, the share of sentences
            in the ``'Total with Indicators'`` row; ``'individual'`` plots
            grouped occurrence counts per indicator.
        min_value: For ``'individual'`` only. An indicator is kept when at
            least one of its rows has ``Count >= min_value``.

    Returns:
        The Plotly figure.

    Raises:
        ValueError: If ``chart_type`` is not one of the two supported values.
    """
    if chart_type not in ('overall', 'individual'):
        raise ValueError(
            f"chart_type must be 'overall' or 'individual', got {chart_type!r}"
        )

    df = _read_tsv('indicator_overview.tsv')

    if chart_type == 'overall':
        df_filtered = df[df['Indicator'] == 'Total with Indicators'].copy()

        # Use the first 'Total Sentences' value seen for each subfolder.
        total_sentences_per_subfolder = (
            df.groupby('Subfolder')['Total Sentences'].first().to_dict()
        )
        df_filtered['Total Sentences'] = df_filtered['Subfolder'].map(
            total_sentences_per_subfolder
        )
        df_filtered['Indicator_Share'] = (
            df_filtered['Count'] / df_filtered['Total Sentences']
        )
        df_filtered['Indicator_Share_Text'] = (
            (df_filtered['Indicator_Share'] * 100).round(2).astype(str) + '%'
        )

        fig = px.bar(
            df_filtered,
            x='Subfolder',
            y='Indicator_Share',
            labels={
                'Indicator_Share': 'Share of Sentences with Indicators',
                'Subfolder': '',
            },
            color='Subfolder',
            text='Indicator_Share_Text',
            color_discrete_sequence=px.colors.qualitative.D3,
            custom_data=['Total Sentences', 'Count'],
        )
        fig.update_traces(
            # Plotly hover text breaks lines on <br>, not on newline characters.
            hovertemplate=(
                '%{x}<br>'
                'Share with Indicators: %{y:.1%}<br>'
                'Total Sentences: %{customdata[0]}<br>'
                'Sentences with Indicators: %{customdata[1]}'
            ),
            textposition='inside',
            texttemplate='%{text}',
            textfont=dict(color='rgb(255, 255, 255)'),
            insidetextanchor='middle',
        )
    else:
        excluded = ['Total with Indicators', 'None', '!besprechen']
        # read_csv may parse 'None' as NaN, so missing indicators are dropped
        # explicitly in addition to the literal strings.
        keep_rows = df['Indicator'].notna() & ~df['Indicator'].isin(excluded)
        df_filtered = df[keep_rows].copy()

        frequent = df_filtered.loc[
            df_filtered['Count'] >= min_value, 'Indicator'
        ].unique()
        df_filtered = df_filtered[df_filtered['Indicator'].isin(frequent)].copy()
        df_filtered['Indicator'] = df_filtered['Indicator'].str.capitalize()

        fig = px.bar(
            df_filtered,
            x='Subfolder',
            y='Count',
            color='Indicator',
            barmode='group',
            labels={'Count': 'Occurrences', 'Subfolder': '', 'Indicator': ' INDICATOR'},
            color_discrete_sequence=px.colors.qualitative.D3,
        )
        fig.update_traces(
            texttemplate='%{y}',
            textposition='inside',
            textfont=dict(color='rgb(255, 255, 255)'),
            insidetextanchor='middle',
        )

    fig.update_layout(
        xaxis=dict(showline=True),
        yaxis=dict(
            showticklabels=True,
            title='',
            # Shares are fractions, so only the overall chart uses percent ticks.
            tickformat=".0%" if chart_type == 'overall' else None,
        ),
        bargap=0.05,
        showlegend=(chart_type == 'individual'),
    )
    return fig


def causes_chart(min_value=30):
    """Build a grouped bar chart of cause occurrences per subfolder.

    Reads ``indicator_cause_sentence_metadata.tsv`` and keeps only causes that
    occur at least ``min_value`` times across the whole file.

    Args:
        min_value: Minimum total number of rows a cause needs to be plotted.

    Returns:
        The Plotly figure.
    """
    df = _read_tsv('indicator_cause_sentence_metadata.tsv')

    # read_csv may parse 'N/A' as NaN; exclude both representations.
    has_cause = df['cause'].notna() & (df['cause'] != 'N/A')
    df_filtered = df[has_cause].copy()

    cause_totals = df_filtered['cause'].value_counts()
    frequent_causes = cause_totals[cause_totals >= min_value].index
    df_filtered = df_filtered[df_filtered['cause'].isin(frequent_causes)].copy()
    df_filtered['cause'] = df_filtered['cause'].str.capitalize()

    counts = (
        df_filtered.groupby(['subfolder', 'cause']).size().reset_index(name='Count')
    )
    fig = px.bar(
        counts,
        x='subfolder',
        y='Count',
        color='cause',
        barmode='group',
        labels={'Count': 'Occurrences', 'subfolder': '', 'cause': 'CAUSE'},
        color_discrete_sequence=px.colors.qualitative.D3,
    )
    fig.update_layout(
        xaxis=dict(showline=True),
        yaxis=dict(showticklabels=True, title=''),
    )
    fig.update_traces(
        texttemplate='%{y}',
        textposition='inside',
        textfont=dict(color='rgb(255, 255, 255)'),
        insidetextanchor='middle',
    )
    return fig


def scatter(include_modality=False, min_indicator_count=10):
    """Build a 2-D UMAP scatter plot from ``feature_matrix.tsv``.

    Columns are grouped by their ``indicator_``, ``cause_`` and ``modality_``
    name prefixes. Rows are kept when they have at least one indicator that
    occurs ``min_indicator_count`` times or more ('!besprechen' is ignored
    for this selection).

    Args:
        include_modality: When true, the modality columns stay in the feature
            matrix and the active modalities are shown on hover.
        min_indicator_count: Minimum column sum for an indicator to count
            towards row selection and to be listed in the symbol labels.

    Returns:
        The Plotly figure.
    """
    df = _read_tsv('feature_matrix.tsv')

    indicator_columns = [col for col in df.columns if col.startswith('indicator_')]
    cause_columns = [col for col in df.columns if col.startswith('cause_')]
    modality_columns = [col for col in df.columns if col.startswith('modality_')]

    # Keep rows that have at least one indicator or one cause.
    has_indicator = df[indicator_columns].sum(axis=1) > 0
    has_cause = df[cause_columns].sum(axis=1) > 0
    df_filtered = df[has_indicator | has_cause]

    # '!besprechen' does not count as a valid indicator for row selection.
    indicator_columns = [
        col for col in indicator_columns if 'indicator_!besprechen' not in col
    ]
    indicator_counts = df_filtered[indicator_columns].sum()
    indicators_to_keep = indicator_counts[
        indicator_counts >= min_indicator_count
    ].index.tolist()

    # Drop rows left without any sufficiently frequent indicator.
    df_filtered = df_filtered[df_filtered[indicators_to_keep].sum(axis=1) > 0]

    # Everything except 'subfolder' (and, optionally, modality) is a feature.
    columns_to_drop = ['subfolder']
    if not include_modality:
        columns_to_drop += modality_columns
    features_clean = df_filtered.drop(columns=columns_to_drop).fillna(0)

    metadata = df_filtered[['subfolder']].copy()
    metadata['indicator'] = _active_labels(
        df_filtered, indicators_to_keep, 'indicator_'
    )
    metadata['cause'] = _active_labels(df_filtered, cause_columns, 'cause_')
    if include_modality:
        # Assumes modality columns are numeric flags like the indicator and
        # cause columns -- TODO confirm against the feature matrix.
        metadata['Modality'] = _active_labels(
            df_filtered, modality_columns, 'modality_'
        )

    reducer = umap.UMAP(
        n_components=2,
        random_state=42,
        n_neighbors=50,
        n_jobs=1,
        metric='cosine',
    )
    reduced_features = reducer.fit_transform(features_clean)

    df_reduced = pd.DataFrame(reduced_features, columns=['Component 1', 'Component 2'])
    df_reduced = pd.concat([df_reduced, metadata.reset_index(drop=True)], axis=1)

    # Show the cause on hover and hide the raw embedding coordinates.
    hover_data = {'cause': True, 'Component 1': False, 'Component 2': False}
    if include_modality:
        hover_data['Modality'] = True

    # Display 'subfolder' as 'Effect' in the figure.
    custom_labels = {
        'subfolder': 'Effect',
    }

    fig = px.scatter(
        df_reduced,
        x='Component 1',
        y='Component 2',
        color='subfolder',  # one colour per subfolder
        symbol='indicator',  # one marker symbol per indicator combination
        labels=custom_labels,
        hover_data=hover_data,
        color_discrete_sequence=px.colors.qualitative.D3,
    )
    fig.update_layout(
        xaxis=dict(showgrid=True),
        yaxis=dict(showgrid=True),
        showlegend=True,
        legend=dict(
            title="Effect, Indicator",
            yanchor="top",
            xanchor="left",
            borderwidth=1,
        ),
    )
    return fig


def _build_sankey_links(df_filtered):
    """Return ``(labels, sources, targets, values)`` for a three-stage Sankey.

    Nodes are laid out as causes, then indicators, then subfolders. Each group
    has its own label-to-index mapping so that a label appearing in more than
    one group still gets a separate node per group.
    """
    causes = list(df_filtered['cause'].unique())
    indicators = list(df_filtered['indicator'].unique())
    subfolders = list(df_filtered['subfolder'].unique())
    labels = causes + indicators + subfolders

    indicator_offset = len(causes)
    subfolder_offset = indicator_offset + len(indicators)
    cause_index = {label: idx for idx, label in enumerate(causes)}
    indicator_index = {
        label: indicator_offset + idx for idx, label in enumerate(indicators)
    }
    subfolder_index = {
        label: subfolder_offset + idx for idx, label in enumerate(subfolders)
    }

    cause_indicator_counts = (
        df_filtered.groupby(['cause', 'indicator']).size().reset_index(name='count')
    )
    indicator_subfolder_counts = (
        df_filtered.groupby(['indicator', 'subfolder']).size().reset_index(name='count')
    )

    sources = []
    targets = []
    values = []

    # cause -> indicator links
    for cause, indicator, count in cause_indicator_counts.itertuples(
        index=False, name=None
    ):
        sources.append(cause_index[cause])
        targets.append(indicator_index[indicator])
        values.append(int(count))

    # indicator -> subfolder links
    for indicator, subfolder, count in indicator_subfolder_counts.itertuples(
        index=False, name=None
    ):
        sources.append(indicator_index[indicator])
        targets.append(subfolder_index[subfolder])
        values.append(int(count))

    return labels, sources, targets, values


def sankey(cause_threshold=10, indicator_threshold=5):
    """Build a cause -> indicator -> subfolder Sankey diagram.

    Reads ``indicator_cause_sentence_metadata.tsv``, drops rows with a missing
    cause, indicator or subfolder, and removes ``'_nk'`` from subfolder names.

    Args:
        cause_threshold: Minimum number of rows a cause needs to be included.
        indicator_threshold: Minimum number of rows an indicator needs to be
            included.

    Returns:
        The Plotly figure, fixed at 500 x 500 pixels.
    """
    df = _read_tsv('indicator_cause_sentence_metadata.tsv')

    # Copy so the column assignment below does not act on a slice of the
    # frame returned by read_csv.
    df = df.dropna(subset=['cause', 'indicator', 'subfolder']).copy()
    # Literal (non-regex) replacement of the '_nk' marker.
    df['subfolder'] = df['subfolder'].str.replace('_nk', '', regex=False)

    cause_counts = df['cause'].value_counts()
    indicator_counts = df['indicator'].value_counts()
    valid_causes = cause_counts[cause_counts >= cause_threshold].index
    valid_indicators = indicator_counts[indicator_counts >= indicator_threshold].index

    df_filtered = df[
        df['cause'].isin(valid_causes) & df['indicator'].isin(valid_indicators)
    ]

    all_labels, sources, targets, values = _build_sankey_links(df_filtered)

    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=all_labels,
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values,
        ),
    )])
    fig.update_layout(
        autosize=False,  # keep the fixed size below
        width=500,
        height=500,
    )
    return fig