|
import pandas as pd |
|
import plotly.express as px |
|
import plotly.graph_objects as go |
|
import os |
|
import umap |
|
|
|
def indicator_chart(chart_type='overall'):
    """Build a bar chart from ``data/indicator_overview.tsv``.

    Args:
        chart_type: Which view to draw.
            ``'overall'`` plots, per ``Subfolder``, the ``Count`` of the
            ``'Total with Indicators'`` row divided by that subfolder's
            ``'Total Sentences'`` value.
            ``'individual'`` plots grouped ``Count`` bars per indicator,
            keeping only indicators whose ``Count`` reaches 5 in at least
            one row and skipping the summary/excluded indicator rows.

    Returns:
        The plotly figure for the requested view.

    Raises:
        ValueError: If ``chart_type`` is neither ``'overall'`` nor
            ``'individual'``.
    """
    # Validate first: an unsupported value previously fell through both
    # branches and failed later with an unbound ``fig``.
    if chart_type not in ('overall', 'individual'):
        raise ValueError(
            "Unknown chart_type {!r}; expected 'overall' or 'individual'.".format(chart_type)
        )

    data_file = os.path.join('data', 'indicator_overview.tsv')
    df = pd.read_csv(data_file, sep='\t')

    if chart_type == 'overall':
        df_filtered = df[df['Indicator'] == 'Total with Indicators'].copy()

        # Use the first non-null 'Total Sentences' value recorded per subfolder
        # as the denominator for the share.
        total_sentences_per_subfolder = df.groupby('Subfolder')['Total Sentences'].first().to_dict()
        df_filtered['Total Sentences'] = df_filtered['Subfolder'].map(total_sentences_per_subfolder)
        df_filtered['Indicator_Share'] = df_filtered['Count'] / df_filtered['Total Sentences']
        df_filtered['Indicator_Share_Text'] = (
            (df_filtered['Indicator_Share'] * 100).round(2).astype(str) + '%'
        )

        fig = px.bar(
            df_filtered,
            x='Subfolder',
            y='Indicator_Share',
            labels={'Indicator_Share': 'Share of Sentences with Indicators', 'Subfolder': ''},
            color='Subfolder',
            text='Indicator_Share_Text',
            color_discrete_sequence=px.colors.qualitative.D3,
            custom_data=['Total Sentences', 'Count'],
        )

        fig.update_traces(
            hovertemplate=(
                '<b>%{x}</b><br>' +
                'Share with Indicators: %{y:.1%}<br>' +
                'Total Sentences: %{customdata[0]}<br>' +
                'Sentences with Indicators: %{customdata[1]}<extra></extra>'
            ),
            textposition='inside',
            texttemplate='%{text}',
            textfont=dict(color='rgb(255, 255, 255)'),
            insidetextanchor='middle',
        )
    else:
        min_value = 5
        exclude_indicators = ['!besprechen']

        # Drop the summary rows and the explicitly excluded indicators.
        skipped = ['Total with Indicators', 'None'] + exclude_indicators
        df_filtered = df[~df['Indicator'].isin(skipped)]

        # An indicator is kept (in every subfolder) as soon as one of its rows
        # reaches the threshold.
        indicators_meeting_threshold = df_filtered.loc[
            df_filtered['Count'] >= min_value, 'Indicator'
        ].unique()

        # Explicit copy so the column assignment below does not write to a
        # slice of ``df`` (avoids SettingWithCopyWarning).
        df_filtered = df_filtered[
            df_filtered['Indicator'].isin(indicators_meeting_threshold)
        ].copy()
        df_filtered['Indicator'] = df_filtered['Indicator'].str.capitalize()

        fig = px.bar(
            df_filtered,
            x='Subfolder',
            y='Count',
            color='Indicator',
            barmode='group',
            labels={'Count': 'Occurrences', 'Subfolder': '', 'Indicator': ' <b>INDICATOR</b>'},
            color_discrete_sequence=px.colors.qualitative.D3,
        )

        fig.update_traces(
            texttemplate='%{y}',
            textposition='inside',
            textfont=dict(color='rgb(255, 255, 255)'),
            insidetextanchor='middle',
        )

    fig.update_layout(
        xaxis=dict(showline=True),
        # Shares are fractions, so only the 'overall' view gets percent ticks.
        yaxis=dict(
            showticklabels=True,
            title='',
            tickformat=".0%" if chart_type == 'overall' else None,
        ),
        bargap=0.05,
        # Bars are already labelled by subfolder in the 'overall' view.
        showlegend=(chart_type == 'individual'),
    )

    return fig
|
|
|
def causes_chart():
    """Build a grouped bar chart of cause occurrences per subfolder.

    Reads ``data/indicator_cause_sentence_metadata.tsv``, drops rows without
    a cause, keeps only causes that occur at least 30 times overall, and
    counts the remaining rows per (``subfolder``, ``cause``) pair.

    Returns:
        The plotly figure with one bar group per subfolder.
    """
    data_file = os.path.join('data', 'indicator_cause_sentence_metadata.tsv')
    df = pd.read_csv(data_file, sep='\t')

    min_value = 30

    # ``read_csv`` turns the literal 'N/A' into NaN by default, so comparing
    # against the string alone never matches; filter missing values
    # explicitly and keep the string check for files read with other
    # NA settings.
    has_cause = df['cause'].notna() & (df['cause'] != 'N/A')
    with_cause = df[has_cause]

    # Threshold on the total number of rows per cause (across subfolders).
    cause_totals = with_cause['cause'].value_counts()
    causes_meeting_threshold = cause_totals[cause_totals >= min_value].index

    # Explicit copy so the column assignment below does not write to a slice
    # of ``df`` (avoids SettingWithCopyWarning).
    df_filtered = with_cause[with_cause['cause'].isin(causes_meeting_threshold)].copy()
    df_filtered['cause'] = df_filtered['cause'].str.capitalize()

    counts = df_filtered.groupby(['subfolder', 'cause']).size().reset_index(name='Count')

    fig = px.bar(
        counts,
        x='subfolder',
        y='Count',
        color='cause',
        barmode='group',
        labels={'Count': 'Occurrences', 'subfolder': '', 'cause': '<b>CAUSE</b>'},
        color_discrete_sequence=px.colors.qualitative.D3,
    )

    fig.update_layout(
        xaxis=dict(showline=True),
        yaxis=dict(showticklabels=True, title=''),
    )

    fig.update_traces(
        texttemplate='%{y}',
        textposition='inside',
        textfont=dict(color='rgb(255, 255, 255)'),
        insidetextanchor='middle',
    )

    return fig
|
|
|
def _active_labels(frame, columns, prefix):
    """Return, per row of ``frame``, the comma-joined ``columns`` whose value is > 0, with ``prefix`` removed."""
    names = [column.replace(prefix, '') for column in columns]
    active = (frame[columns] > 0).to_numpy()
    return pd.Series(
        [', '.join(name for name, is_on in zip(names, row) if is_on) for row in active],
        index=frame.index,
        dtype=object,
    )


def scatter(include_modality=False):
    """Build a 2-D UMAP scatter plot of the rows in ``data/feature_matrix.tsv``.

    Rows are kept when they have at least one positive value among the
    retained ``indicator_*`` columns (``indicator_!besprechen`` is ignored and
    an indicator must sum to at least 10 over the pre-filtered rows). All
    remaining columns except ``subfolder`` are used as UMAP features, with
    missing values replaced by 0.

    Args:
        include_modality: When true, the ``modality_*`` columns stay in the
            feature matrix and the active modalities are shown on hover.
            When false, those columns are dropped before the projection.

    Returns:
        The plotly scatter figure, coloured by ``subfolder`` with one marker
        symbol per indicator combination.
    """
    data_file = os.path.join('data', 'feature_matrix.tsv')
    df = pd.read_csv(data_file, sep='\t')

    indicator_columns = [col for col in df.columns if col.startswith('indicator_')]
    cause_columns = [col for col in df.columns if col.startswith('cause_')]
    modality_columns = [col for col in df.columns if col.startswith('modality_')]

    # First pass: keep rows that carry any indicator or any cause.
    has_indicator = df[indicator_columns].sum(axis=1) > 0
    has_cause = df[cause_columns].sum(axis=1) > 0
    df_filtered = df[has_indicator | has_cause]

    # The excluded indicator still counts for the first pass above, but not
    # for the frequency threshold or the labels below.
    indicator_columns = [col for col in indicator_columns if 'indicator_!besprechen' not in col]

    indicator_counts = df_filtered[indicator_columns].sum()
    indicators_to_keep = indicator_counts[indicator_counts >= 10].index.tolist()

    # Second pass: keep rows with at least one sufficiently frequent indicator.
    df_filtered = df_filtered[df_filtered[indicators_to_keep].sum(axis=1) > 0]

    columns_to_drop = ['subfolder']
    if not include_modality:
        columns_to_drop += modality_columns

    features_clean = df_filtered.drop(columns=columns_to_drop).fillna(0)

    metadata = df_filtered[['subfolder']].copy()
    metadata['indicator'] = _active_labels(df_filtered, indicators_to_keep, 'indicator_')
    metadata['cause'] = _active_labels(df_filtered, cause_columns, 'cause_')
    if include_modality:
        # The hover configuration below references a 'Modality' column, which
        # must exist in the plotted frame or plotly rejects it.
        # NOTE(review): assumes modality_* columns are numeric flags/counts
        # like the indicator_/cause_ columns — confirm against the data.
        metadata['Modality'] = _active_labels(df_filtered, modality_columns, 'modality_')

    reducer = umap.UMAP(n_components=2, random_state=42, n_neighbors=50, n_jobs=1, metric='cosine')
    reduced_features = reducer.fit_transform(features_clean)

    df_reduced = pd.DataFrame(reduced_features, columns=['Component 1', 'Component 2'])
    # The projection is positionally aligned with df_filtered, so drop the
    # original index before joining the metadata side by side.
    df_reduced = pd.concat([df_reduced, metadata.reset_index(drop=True)], axis=1)

    hover_data = {'cause': True, 'Component 1': False, 'Component 2': False}
    if include_modality:
        hover_data['Modality'] = True

    custom_labels = {
        'subfolder': 'Effect',
    }

    fig = px.scatter(
        df_reduced,
        x='Component 1',
        y='Component 2',
        color='subfolder',
        symbol='indicator',
        labels=custom_labels,
        hover_data=hover_data,
        color_discrete_sequence=px.colors.qualitative.D3,
    )

    fig.update_layout(
        xaxis=dict(showgrid=True),
        yaxis=dict(showgrid=True),
        showlegend=True,
        legend=dict(
            title="Effect, Indicator",
            yanchor="top",
            xanchor="left",
            borderwidth=1,
        ),
    )

    return fig
|
|
|
def sankey(cause_threshold=10, indicator_threshold=5):
    """Build a cause -> indicator -> subfolder Sankey diagram.

    Reads ``data/indicator_cause_sentence_metadata.tsv``, drops rows missing
    any of ``cause``, ``indicator`` or ``subfolder``, strips ``'_nk'`` from
    the subfolder names, and keeps only rows whose cause and indicator are
    both frequent enough. Link widths are row counts per
    (cause, indicator) and per (indicator, subfolder) pair.

    Args:
        cause_threshold: Minimum number of rows a cause needs to be kept.
        indicator_threshold: Minimum number of rows an indicator needs to be
            kept.

    Returns:
        The plotly Sankey figure (fixed at 500 x 500).
    """
    data_file = os.path.join('data', 'indicator_cause_sentence_metadata.tsv')
    df = pd.read_csv(data_file, sep='\t')

    # Copy so the column assignment below never targets a view of the
    # freshly read frame.
    df = df.dropna(subset=['cause', 'indicator', 'subfolder']).copy()
    # '_nk' is a literal suffix, not a pattern; be explicit so the result
    # does not depend on the pandas version's default for ``regex``.
    df['subfolder'] = df['subfolder'].str.replace('_nk', '', regex=False)

    cause_counts = df['cause'].value_counts()
    indicator_counts = df['indicator'].value_counts()
    valid_causes = cause_counts[cause_counts >= cause_threshold].index
    valid_indicators = indicator_counts[indicator_counts >= indicator_threshold].index

    df_filtered = df[df['cause'].isin(valid_causes) & df['indicator'].isin(valid_indicators)]

    # Nodes are keyed by (column, label) rather than by label alone: a string
    # that occurs in two columns (e.g. as both a cause and an indicator) must
    # stay two distinct nodes, otherwise links would be routed to whichever
    # node was registered last.
    node_keys = [
        (column, label)
        for column in ('cause', 'indicator', 'subfolder')
        for label in df_filtered[column].unique()
    ]
    node_index = {key: position for position, key in enumerate(node_keys)}
    all_labels = [label for _, label in node_keys]

    sources = []
    targets = []
    values = []
    for source_column, target_column in (('cause', 'indicator'), ('indicator', 'subfolder')):
        link_counts = df_filtered.groupby([source_column, target_column]).size()
        for (source_label, target_label), count in link_counts.items():
            sources.append(node_index[(source_column, source_label)])
            targets.append(node_index[(target_column, target_label)])
            values.append(int(count))

    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=all_labels,
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values,
        ),
    )])

    fig.update_layout(
        autosize=False,
        width=500,
        height=500,
    )

    return fig