import pandas as pd
import plotly.express as px
import os
import umap
from sklearn.preprocessing import StandardScaler

def indicator_chart(chart_type='overall', min_value=5):
    """Build a bar chart from ``data/indicator_overview.tsv``.

    The TSV is read with the columns 'Subfolder', 'Indicator', 'Count' and
    'Total Sentences'.

    Args:
        chart_type: 'overall' plots, per subfolder, the share of sentences
            that carry any indicator (rows whose Indicator equals
            'Total with Indicators'). 'individual' plots grouped bars with
            the raw count of every single indicator per subfolder.
        min_value: Only used for 'individual'. An indicator is shown when its
            'Count' reaches this value in at least one row.

    Returns:
        The plotly figure.

    Raises:
        ValueError: If ``chart_type`` is not one of the supported values.
    """
    # Validate before touching the file system; previously an unknown value
    # fell through both branches and crashed on the unbound ``fig``.
    if chart_type not in ('overall', 'individual'):
        raise ValueError(
            f"Unknown chart_type {chart_type!r}; expected 'overall' or 'individual'"
        )

    data_file = os.path.join('data', 'indicator_overview.tsv')
    df = pd.read_csv(data_file, sep='\t')

    if chart_type == 'overall':
        df_filtered = df[df['Indicator'] == 'Total with Indicators'].copy()
        # Use the first 'Total Sentences' value seen for each subfolder as
        # the denominator of the share.
        total_sentences_per_subfolder = df.groupby('Subfolder')['Total Sentences'].first().to_dict()
        df_filtered['Total Sentences'] = df_filtered['Subfolder'].map(total_sentences_per_subfolder)
        df_filtered['Indicator_Share'] = df_filtered['Count'] / df_filtered['Total Sentences']
        df_filtered['Indicator_Share_Text'] = (df_filtered['Indicator_Share'] * 100).round(2).astype(str) + '%'

        fig = px.bar(
            df_filtered,
            x='Subfolder',
            y='Indicator_Share',
            labels={'Indicator_Share': 'Share of Sentences with Indicators', 'Subfolder': ''},
            color='Subfolder',
            text='Indicator_Share_Text',
            color_discrete_sequence=px.colors.qualitative.D3,
            custom_data=['Total Sentences', 'Count']
        )

        fig.update_traces(
            hovertemplate=(
                '<b>%{x}</b><br>' +
                'Share with Indicators: %{y:.1%}<br>' +
                'Total Sentences: %{customdata[0]}<br>' +
                'Sentences with Indicators: %{customdata[1]}<extra></extra>'
            ),
            textposition='inside',
            texttemplate='%{text}',
            textfont=dict(color='rgb(255, 255, 255)'),
            insidetextanchor='middle',
        )

    else:  # chart_type == 'individual'
        exclude_indicators = ['!besprechen']
        excluded = ['Total with Indicators', 'None'] + exclude_indicators
        # pandas' default NA parsing can turn the literal 'None' into NaN, in
        # which case the isin() check alone would not remove those rows.
        is_real_indicator = df['Indicator'].notna() & ~df['Indicator'].isin(excluded)
        df_filtered = df[is_real_indicator]

        reaches_threshold = df_filtered['Count'] >= min_value
        indicators_meeting_threshold = df_filtered.loc[reaches_threshold, 'Indicator'].unique()
        # .copy() so the column assignment below does not write into a slice.
        df_filtered = df_filtered[df_filtered['Indicator'].isin(indicators_meeting_threshold)].copy()
        df_filtered['Indicator'] = df_filtered['Indicator'].str.capitalize()

        fig = px.bar(
            df_filtered,
            x='Subfolder',
            y='Count',
            color='Indicator',
            barmode='group',
            labels={'Count': 'Occurrences', 'Subfolder': '', 'Indicator': '  <b>INDICATOR</b>'},
            color_discrete_sequence=px.colors.qualitative.D3
        )

        fig.update_traces(
            texttemplate='%{y}',
            textposition='inside',
            textfont=dict(color='rgb(255, 255, 255)'),
        )

    fig.update_layout(
        xaxis=dict(showline=True),
        yaxis=dict(showticklabels=True, title=''),
        bargap=0.05,
        # The legend only carries information when bars are coloured by indicator.
        showlegend=(chart_type == 'individual')
    )

    return fig

def causes_chart(min_value=30):
    """Build a grouped bar chart of cause occurrences per subfolder.

    Reads ``data/indicator_cause_sentence_metadata.tsv`` and uses its
    'cause' and 'subfolder' columns. Each bar is the number of rows for one
    (subfolder, cause) pair.

    Args:
        min_value: A cause is shown only if it occurs at least this many
            times across the whole file (all subfolders combined).

    Returns:
        The plotly figure.
    """
    data_file = os.path.join('data', 'indicator_cause_sentence_metadata.tsv')
    df = pd.read_csv(data_file, sep='\t')

    # pandas' default NA parsing turns the literal 'N/A' into NaN, so missing
    # causes are dropped explicitly; the string comparison is kept for files
    # read or written with NA parsing disabled.
    has_cause = df['cause'].notna() & (df['cause'] != 'N/A')
    df_filtered = df[has_cause]

    # Threshold: keep causes that occur often enough overall.
    cause_counts = df_filtered['cause'].value_counts()
    causes_meeting_threshold = cause_counts[cause_counts >= min_value].index
    # .copy() so the column assignment below does not write into a slice.
    df_filtered = df_filtered[df_filtered['cause'].isin(causes_meeting_threshold)].copy()
    df_filtered['cause'] = df_filtered['cause'].str.capitalize()

    counts_per_group = (
        df_filtered.groupby(['subfolder', 'cause']).size().reset_index(name='Count')
    )

    fig = px.bar(
        counts_per_group,
        x='subfolder',
        y='Count',
        color='cause',
        barmode='group',
        labels={'Count': 'Occurrences', 'subfolder': '', 'cause': '<b>CAUSE</b>'},
        color_discrete_sequence=px.colors.qualitative.G10,
    )

    fig.update_layout(
        xaxis=dict(showline=True),
        yaxis=dict(showticklabels=True, title=''),
    )

    fig.update_traces(
        texttemplate='%{y}',
        textposition='inside',
        textfont=dict(color='rgb(255, 255, 255)'),
        insidetextanchor='middle',
    )

    return fig

def _join_active_labels(frame, columns, prefix):
    """Return one comma-separated label string per row of ``frame``.

    A column contributes its name (with ``prefix`` removed) to a row's string
    when the row's value in that column is greater than 0. NaN compares as
    not greater than 0 and therefore never contributes.
    """
    labels = [column.replace(prefix, '') for column in columns]
    active = frame[columns].gt(0).to_numpy()
    return [
        ', '.join(label for label, is_on in zip(labels, row) if is_on)
        for row in active
    ]


def scatter_plot(include_modality=False, min_indicator_count=10):
    """Build a 2-D UMAP scatter plot of the sentence feature matrix.

    Reads ``data/feature_matrix.tsv``. Columns are grouped by their name
    prefix ('indicator_', 'cause_', 'modality_'); 'subfolder' is used to
    colour the points.

    Args:
        include_modality: When True the 'modality_' columns stay in the
            feature matrix and the active modalities are shown on hover.
            When False those columns are dropped before the reduction.
        min_indicator_count: An indicator column counts as "kept" only if its
            column sum over the pre-filtered rows reaches this value.

    Returns:
        The plotly figure.
    """
    data_file = os.path.join('data', 'feature_matrix.tsv')
    df = pd.read_csv(data_file, sep='\t')

    indicator_columns = [col for col in df.columns if col.startswith('indicator_')]
    cause_columns = [col for col in df.columns if col.startswith('cause_')]
    modality_columns = [col for col in df.columns if col.startswith('modality_')]

    # Keep sentences that have at least one indicator or at least one cause.
    has_indicator = df[indicator_columns].sum(axis=1) > 0
    has_cause = df[cause_columns].sum(axis=1) > 0
    df_filtered = df[has_indicator | has_cause]

    # Exclude indicator '!besprechen'
    indicator_columns = [col for col in indicator_columns if 'indicator_!besprechen' not in col]

    # Limit indicators to those that occur often enough
    indicator_counts = df_filtered[indicator_columns].sum()
    indicators_to_keep = indicator_counts[indicator_counts >= min_indicator_count].index.tolist()

    # Further filter to exclude entries without any kept indicator
    df_filtered = df_filtered[df_filtered[indicators_to_keep].sum(axis=1) > 0]

    # Everything except 'subfolder' (and, optionally, the modality columns)
    # is fed to the dimensionality reduction.
    # NOTE(review): indicator columns that were excluded above (rare ones and
    # '!besprechen') are still part of the feature matrix; only rows are
    # filtered by them. Confirm this is intended.
    columns_to_drop = ['subfolder']
    if not include_modality:
        columns_to_drop += modality_columns

    features = df_filtered.drop(columns=columns_to_drop)

    # Fill NaN values with 0 for the feature matrix
    features_clean = features.fillna(0)

    # Metadata is kept in row order of df_filtered so it lines up with the
    # rows of the reduced feature array.
    metadata = df_filtered[['subfolder']].copy()
    metadata['indicator'] = _join_active_labels(df_filtered, indicators_to_keep, 'indicator_')
    metadata['cause'] = _join_active_labels(df_filtered, cause_columns, 'cause_')

    # hover_data must be a list (or dict): the previous set literal made the
    # include_modality branch fail on item assignment, and there was no
    # 'Modality' column to display.
    hover_data = ['cause']
    if include_modality:
        metadata['Modality'] = _join_active_labels(df_filtered, modality_columns, 'modality_')
        hover_data.append('Modality')

    # Perform UMAP dimensionality reduction
    reducer = umap.UMAP(n_components=2, random_state=42, n_neighbors=50, metric='cosine')
    reduced_features = reducer.fit_transform(features_clean)
    df_reduced = pd.DataFrame(reduced_features, columns=['Component 1', 'Component 2'])
    # Reset the index so the positional rows of both frames are concatenated
    # side by side instead of being aligned on the original row labels.
    df_reduced = pd.concat([df_reduced, metadata.reset_index(drop=True)], axis=1)

    fig = px.scatter(
        df_reduced,
        x='Component 1',
        y='Component 2',
        color='subfolder',
        hover_data=hover_data,
        labels={'Component 1': 'UMAP Dim 1', 'Component 2': 'UMAP Dim 2'},
        color_discrete_sequence=px.colors.qualitative.Plotly
    )

    fig.update_layout(
        xaxis=dict(showgrid=False),
        yaxis=dict(showgrid=False),
        showlegend=True
    )

    return fig