File size: 10,366 Bytes
45d0933
 
9525dec
45d0933
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b22889
45d0933
 
 
 
9525dec
45d0933
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f0f736
45d0933
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9525dec
45d0933
 
 
9525dec
45d0933
 
 
 
 
9525dec
45d0933
 
 
 
 
 
 
 
 
 
 
9525dec
45d0933
 
 
 
 
 
 
9525dec
45d0933
 
 
 
9525dec
 
45d0933
 
 
 
9525dec
6b22889
45d0933
 
 
6b22889
 
 
 
45d0933
 
 
 
9525dec
 
6b22889
45d0933
4f0f736
45d0933
 
 
6b22889
 
9525dec
 
6b22889
9525dec
 
 
 
45d0933
 
 
 
9525dec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import os
import umap

def indicator_chart(chart_type='overall'):
    """Build a bar chart of indicator statistics per subfolder.

    Reads the tab-separated file ``data/indicator_overview.tsv`` and uses its
    ``Subfolder``, ``Indicator``, ``Count`` and ``Total Sentences`` columns.

    Args:
        chart_type: ``'overall'`` plots, per subfolder, the ``Count`` of the
            ``'Total with Indicators'`` row divided by that subfolder's first
            ``Total Sentences`` value. ``'individual'`` plots grouped bars of
            ``Count`` per indicator, keeping only indicators that reach the
            minimum count in at least one row.

    Returns:
        The Plotly Express bar figure.

    Raises:
        ValueError: If ``chart_type`` is neither ``'overall'`` nor
            ``'individual'``.
    """
    # Validate before touching the file system: previously an unknown value
    # fell through both branches and failed later on an unbound ``fig``.
    if chart_type not in ('overall', 'individual'):
        raise ValueError(
            f"Unsupported chart_type {chart_type!r}; expected 'overall' or 'individual'."
        )

    data_file = os.path.join('data', 'indicator_overview.tsv')
    df = pd.read_csv(data_file, sep='\t')

    if chart_type == 'overall':
        df_filtered = df[df['Indicator'] == 'Total with Indicators'].copy()
        # Take the sentence total from the first row of each subfolder and
        # map it back onto the summary rows.
        total_sentences_per_subfolder = df.groupby('Subfolder')['Total Sentences'].first().to_dict()
        df_filtered['Total Sentences'] = df_filtered['Subfolder'].map(total_sentences_per_subfolder)
        df_filtered['Indicator_Share'] = df_filtered['Count'] / df_filtered['Total Sentences']
        df_filtered['Indicator_Share_Text'] = (df_filtered['Indicator_Share'] * 100).round(2).astype(str) + '%'

        fig = px.bar(
            df_filtered,
            x='Subfolder',
            y='Indicator_Share',
            labels={'Indicator_Share': 'Share of Sentences with Indicators', 'Subfolder': ''},
            color='Subfolder',
            text='Indicator_Share_Text',
            color_discrete_sequence=px.colors.qualitative.D3,
            custom_data=['Total Sentences', 'Count']
        )

        fig.update_traces(
            hovertemplate=(
                '<b>%{x}</b><br>' +
                'Share with Indicators: %{y:.1%}<br>' +
                'Total Sentences: %{customdata[0]}<br>' +
                'Sentences with Indicators: %{customdata[1]}<extra></extra>'
            ),
            textposition='inside',
            texttemplate='%{text}',
            textfont=dict(color='rgb(255, 255, 255)'),
            insidetextanchor='middle',
        )

    else:  # chart_type == 'individual'
        min_value = 5
        exclude_indicators = ['!besprechen']
        excluded = ['Total with Indicators', 'None'] + exclude_indicators
        # Depending on the pandas version, read_csv may parse the literal
        # 'None' as NaN, so missing indicators are excluded explicitly too.
        keep = df['Indicator'].notna() & ~df['Indicator'].isin(excluded)
        df_filtered = df[keep]
        # An indicator is kept everywhere once any of its rows reaches the threshold.
        indicators_meeting_threshold = df_filtered.loc[
            df_filtered['Count'] >= min_value, 'Indicator'
        ].unique()
        # Copy before assigning a column to avoid writing into a slice of ``df``.
        df_filtered = df_filtered[df_filtered['Indicator'].isin(indicators_meeting_threshold)].copy()
        df_filtered['Indicator'] = df_filtered['Indicator'].str.capitalize()

        fig = px.bar(
            df_filtered,
            x='Subfolder',
            y='Count',
            color='Indicator',
            barmode='group',
            labels={'Count': 'Occurrences', 'Subfolder': '', 'Indicator': '  <b>INDICATOR</b>'},
            color_discrete_sequence=px.colors.qualitative.D3
        )

        fig.update_traces(
            texttemplate='%{y}',
            textposition='inside',
            textfont=dict(color='rgb(255, 255, 255)'),
            insidetextanchor='middle'
        )

    # Shared layout: percentage ticks only make sense for the share chart,
    # and the legend is only shown for the per-indicator chart.
    fig.update_layout(
        xaxis=dict(showline=True),
        yaxis=dict(showticklabels=True, title='', tickformat=".0%" if chart_type == 'overall' else None),
        bargap=0.05,
        showlegend=(chart_type == 'individual')
    )

    return fig

def causes_chart():
    """Build a grouped bar chart of cause occurrences per subfolder.

    Reads the tab-separated file ``data/indicator_cause_sentence_metadata.tsv``
    and uses its ``cause`` and ``subfolder`` columns. Rows without a cause are
    dropped, and only causes occurring at least ``min_value`` times overall
    are plotted.

    Returns:
        The Plotly Express bar figure.
    """
    data_file = os.path.join('data', 'indicator_cause_sentence_metadata.tsv')
    df = pd.read_csv(data_file, sep='\t')

    # Threshold: minimum number of rows a cause needs to be plotted.
    min_value = 30

    # read_csv's default NA handling can turn the literal 'N/A' into NaN, so a
    # plain string comparison alone would not remove those rows; test both.
    has_cause = df['cause'].notna() & (df['cause'] != 'N/A')
    df_filtered = df[has_cause]

    cause_totals = df_filtered['cause'].value_counts()
    causes_meeting_threshold = cause_totals[cause_totals >= min_value].index
    # Copy before assigning a column to avoid writing into a slice of ``df``.
    df_filtered = df_filtered[df_filtered['cause'].isin(causes_meeting_threshold)].copy()
    # Capitalisation happens after thresholding, so counts are per raw spelling.
    df_filtered['cause'] = df_filtered['cause'].str.capitalize()

    counts = df_filtered.groupby(['subfolder', 'cause']).size().reset_index(name='Count')

    fig = px.bar(
        counts,
        x='subfolder',
        y='Count',
        color='cause',
        barmode='group',
        labels={'Count': 'Occurrences', 'subfolder': '', 'cause': '<b>CAUSE</b>'},
        color_discrete_sequence=px.colors.qualitative.D3,
    )

    fig.update_layout(
        xaxis=dict(showline=True),
        yaxis=dict(showticklabels=True, title=''),
    )

    fig.update_traces(
        texttemplate='%{y}',
        textposition='inside',
        textfont=dict(color='rgb(255, 255, 255)'),
        insidetextanchor='middle',
    )

    return fig

def scatter(include_modality=False):
    """Build a 2-D UMAP scatter plot of the sentence feature matrix.

    Reads the tab-separated file ``data/feature_matrix.tsv``. Columns are
    grouped by their ``indicator_``, ``cause_`` and ``modality_`` prefixes;
    the ``subfolder`` column is used for colouring. Only rows with at least
    one indicator that occurs 10 or more times (ignoring ``!besprechen``) are
    plotted.

    Args:
        include_modality: When true, the ``modality_*`` columns are kept as
            UMAP input features and the active modalities are shown in the
            hover text. When false, those columns are dropped.

    Returns:
        The Plotly Express scatter figure.
    """
    data_file = os.path.join('data', 'feature_matrix.tsv')
    df = pd.read_csv(data_file, sep='\t')

    indicator_columns = [col for col in df.columns if col.startswith('indicator_')]
    cause_columns = [col for col in df.columns if col.startswith('cause_')]
    modality_columns = [col for col in df.columns if col.startswith('modality_')]

    # Exclude sentences that have neither an indicator nor a cause
    df_filtered = df[(df[indicator_columns].sum(axis=1) > 0) |
                     (df[cause_columns].sum(axis=1) > 0)]

    # Exclude indicator '!besprechen' from the row selection below.
    # NOTE(review): its column still remains among the UMAP input features;
    # confirm whether that is intended.
    indicator_columns = [col for col in indicator_columns if 'indicator_!besprechen' not in col]

    # Limit indicators to those that occur at least 10 times
    indicator_counts = df_filtered[indicator_columns].sum()
    indicators_to_keep = indicator_counts[indicator_counts >= 10].index.tolist()

    # Further filter to exclude entries without any valid indicators
    df_filtered = df_filtered[df_filtered[indicators_to_keep].sum(axis=1) > 0]

    # Exclude non-feature columns for dimensionality reduction
    columns_to_drop = ['subfolder']
    if not include_modality:
        columns_to_drop += modality_columns  # Drop modality columns if not included

    features_clean = df_filtered.drop(columns=columns_to_drop).fillna(0)

    def active_labels(columns, prefix):
        # Per row: comma-separated names (prefix removed) of the given
        # columns whose value is positive.
        return df_filtered[columns].apply(
            lambda row: ', '.join(col.replace(prefix, '') for col in columns if row[col] > 0),
            axis=1,
        )

    # Prepare metadata shown via colour, symbol and hover text
    metadata = df_filtered[['subfolder']].copy()
    metadata['indicator'] = active_labels(indicators_to_keep, 'indicator_')
    metadata['cause'] = active_labels(cause_columns, 'cause_')

    # The hover text refers to a 'Modality' column, so it has to exist in the
    # plotted frame; previously it was requested without ever being created.
    show_modality = include_modality and bool(modality_columns)
    if show_modality:
        metadata['Modality'] = active_labels(modality_columns, 'modality_')

    # UMAP dimensionality reduction
    reducer = umap.UMAP(n_components=2, random_state=42, n_neighbors=50, n_jobs=1, metric='cosine')
    reduced_features = reducer.fit_transform(features_clean)
    df_reduced = pd.DataFrame(reduced_features, columns=['Component 1', 'Component 2'])
    # Reset the index so metadata rows line up positionally with the embedding.
    df_reduced = pd.concat([df_reduced, metadata.reset_index(drop=True)], axis=1)

    # Hover text: show the cause, hide the raw embedding coordinates
    hover_data = {'cause': True, 'Component 1': False, 'Component 2': False}
    if show_modality:
        hover_data['Modality'] = True

    custom_labels = {
        'subfolder': 'Effect',  # Display 'subfolder' as 'Effect'
    }

    fig = px.scatter(
        df_reduced,
        x='Component 1',
        y='Component 2',
        color='subfolder',   # One colour per subfolder
        symbol='indicator',  # One marker symbol per indicator combination
        labels=custom_labels,
        hover_data=hover_data,
        color_discrete_sequence=px.colors.qualitative.D3
    )

    fig.update_layout(
        xaxis=dict(showgrid=True),
        yaxis=dict(showgrid=True),
        showlegend=True,
        legend=dict(
            title="Effect, Indicator",  # Legend entries combine colour and symbol
            yanchor="top",
            xanchor="left",
            borderwidth=1,
        ),
    )

    return fig

def sankey(cause_threshold=10, indicator_threshold=5):
    """Build a cause -> indicator -> subfolder Sankey diagram.

    Reads the tab-separated file ``data/indicator_cause_sentence_metadata.tsv``
    and uses its ``cause``, ``indicator`` and ``subfolder`` columns. Rows with
    a missing value in any of these columns are ignored, and the substring
    ``'_nk'`` is removed from subfolder names.

    Args:
        cause_threshold: Minimum number of rows a cause needs to be included.
        indicator_threshold: Minimum number of rows an indicator needs to be
            included.

    Returns:
        A Plotly ``go.Figure`` containing the Sankey trace, fixed at
        500 x 500 pixels.
    """
    # Load the data
    data_file = os.path.join('data', 'indicator_cause_sentence_metadata.tsv')
    df = pd.read_csv(data_file, sep='\t')

    # Remove rows with NaN values in 'cause', 'indicator', or 'subfolder' columns
    df = df.dropna(subset=['cause', 'indicator', 'subfolder'])

    # Strip '_nk' from 'subfolder' values (literal match, not a regex)
    df = df.assign(subfolder=df['subfolder'].str.replace('_nk', '', regex=False))

    # Overall counts decide which causes and indicators are frequent enough
    cause_counts = df['cause'].value_counts()
    indicator_counts = df['indicator'].value_counts()
    valid_causes = cause_counts[cause_counts >= cause_threshold].index
    valid_indicators = indicator_counts[indicator_counts >= indicator_threshold].index

    # Keep only rows whose cause AND indicator both meet their thresholds
    df_filtered = df[df['cause'].isin(valid_causes) & df['indicator'].isin(valid_indicators)]

    # Pair counts for cause -> indicator and indicator -> subfolder
    cause_indicator_counts = df_filtered.groupby(['cause', 'indicator']).size().reset_index(name='count')
    indicator_subfolder_counts = df_filtered.groupby(['indicator', 'subfolder']).size().reset_index(name='count')

    # Nodes are keyed by (column, label) rather than by label alone, so a
    # string that appears both as a cause and as an indicator (or subfolder)
    # gets two distinct nodes instead of collapsing onto one index.
    node_keys = (
        [('cause', label) for label in df_filtered['cause'].unique()]
        + [('indicator', label) for label in df_filtered['indicator'].unique()]
        + [('subfolder', label) for label in df_filtered['subfolder'].unique()]
    )
    node_index = {key: idx for idx, key in enumerate(node_keys)}
    all_labels = [label for _, label in node_keys]

    # Define sources, targets, and values for the Sankey diagram
    sources = []
    targets = []
    values = []
    link_specs = (
        (cause_indicator_counts, 'cause', 'indicator'),
        (indicator_subfolder_counts, 'indicator', 'subfolder'),
    )
    for pair_counts, source_col, target_col in link_specs:
        # Every label in the pair tables comes from df_filtered, so each
        # lookup is guaranteed to succeed.
        sources.extend(node_index[(source_col, label)] for label in pair_counts[source_col])
        targets.extend(node_index[(target_col, label)] for label in pair_counts[target_col])
        values.extend(pair_counts['count'].tolist())

    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=all_labels,
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values
        )
    )])

    fig.update_layout(
        autosize=False,   # Disable automatic resizing
        width=500,        # Fixed width
        height=500,       # Fixed height
    )

    return fig