import requests
import pandas as pd
from io import StringIO
import streamlit as st
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
from statsmodels.tsa.stattools import acf
from statsmodels.graphics.tsaplots import plot_acf
import matplotlib.pyplot as plt
import folium
from streamlit_folium import st_folium
import seaborn as sns
import datetime
from entsoe.geo import load_zones
import branca
import pytz
import time
from entsoe import EntsoePandasClient
import geopandas as gpd

# Reference timezone used to bucket the cached GitHub download into hourly blocks.
tz = pytz.timezone('Europe/Brussels')


def load_capacity_csv(path: str) -> dict:
    """Load an installed-capacities CSV into a dict: Country -> {tech: value}.

    Literal "NaN" strings are converted to real NaN and all values are cast
    to float so downstream arithmetic behaves predictably.
    """
    df = pd.read_csv(path, index_col='Country')
    df = df.replace({"NaN": np.nan}).astype(float)
    return df.to_dict(orient='index')


# Installed capacity per bidding zone and technology, one table per year.
installed_capacities_2024 = load_capacity_csv('installed_capacities_2024.csv')
installed_capacities_2025 = load_capacity_csv('installed_capacities_2025.csv')

TECHS = ['Solar', 'Wind Offshore', 'Wind Onshore']

# Bidding zones currently exposed in the app.
countries = ['AT', 'BE', 'DE_LU', 'DK_1', 'DK_2', 'FR', 'IT_CALA', 'IT_CNOR',
             'IT_CSUD', 'IT_NORD', 'IT_SARD', 'IT_SICI', 'IT_SUD', 'NL', 'PT', 'ES']


def get_time_zone(country_code):
    """Return the IANA time-zone name for an ENTSO-E bidding-zone code.

    Raises:
        ValueError: if the code has no mapping.
    """
    tz_map = {
        'AL': 'Europe/Tirane', 'AT': 'Europe/Vienna', 'BE': 'Europe/Brussels',
        'BA': 'Europe/Sarajevo', 'BG': 'Europe/Sofia', 'HR': 'Europe/Zagreb',
        'CY': 'Asia/Nicosia', 'CZ': 'Europe/Prague', 'DE_LU': 'Europe/Berlin',
        'DK_1': 'Europe/Copenhagen', 'DK_2': 'Europe/Copenhagen',
        'EE': 'Europe/Tallinn', 'FI': 'Europe/Helsinki', 'MK': 'Europe/Skopje',
        'FR': 'Europe/Paris', 'GR': 'Europe/Athens', 'HU': 'Europe/Budapest',
        'IS': 'Atlantic/Reykjavik', 'IE_SEM': 'Europe/Dublin',
        'IT_CALA': 'Europe/Rome', 'IT_CNOR': 'Europe/Rome',
        'IT_CSUD': 'Europe/Rome', 'IT_NORD': 'Europe/Rome',
        'IT_SARD': 'Europe/Rome', 'IT_SICI': 'Europe/Rome',
        'IT_SUD': 'Europe/Rome', 'LV': 'Europe/Riga', 'LT': 'Europe/Vilnius',
        'ME': 'Europe/Podgorica', 'NL': 'Europe/Amsterdam',
        'NO_1': 'Europe/Oslo', 'NO_2': 'Europe/Oslo', 'NO_3': 'Europe/Oslo',
        'NO_4': 'Europe/Oslo', 'NO_5': 'Europe/Oslo', 'PL': 'Europe/Warsaw',
        'PT': 'Europe/Lisbon', 'MD': 'Europe/Chisinau', 'RO': 'Europe/Bucharest',
        'SE_1': 'Europe/Stockholm', 'SE_2': 'Europe/Stockholm',
        'SE_3': 'Europe/Stockholm', 'SE_4': 'Europe/Stockholm',
        'RS': 'Europe/Belgrade', 'SK': 'Europe/Bratislava',
        'SI': 'Europe/Ljubljana', 'ES': 'Europe/Madrid',
        'CH': 'Europe/Zurich', 'XK': 'Europe/Rome',
    }
    if country_code in tz_map:
        return tz_map[country_code]
    raise ValueError(f"Time zone for country code {country_code} is not defined.")


def convert_European_time(data, bdz):
    """Convert a UTC-indexed frame to the zone's local wall-clock time.

    The resulting index is naive (timezone info stripped after conversion).
    """
    time_zone = get_time_zone(bdz)
    data.index = pd.to_datetime(data.index, utc=True)
    data.index = data.index.tz_convert(time_zone)
    data.index = data.index.tz_localize(None)
    return data


def filter_dataframe(df):
    """Keep only the ENTSO-E actual/forecast columns the app knows about."""
    allowed_columns = {"Load_entsoe", "Load_forecast_entsoe",
                       "Solar_entsoe", "Solar_forecast_entsoe",
                       "Wind_onshore_entsoe", "Wind_onshore_forecast_entsoe",
                       "Wind_offshore_entsoe", "Wind_offshore_forecast_entsoe"}
    return df[[col for col in df.columns if col in allowed_columns]]


def load_GitHub(github_token, bdz):
    """Download one zone's UTC CSV from GitHub; return a cleaned frame or None.

    On success, returns data from 2024-01-01 onwards with the index converted
    to the zone's local (naive) time. On HTTP failure, logs and returns None.
    """
    file_name = f'{bdz}_Entsoe_UTC.csv'
    url = f'https://raw.githubusercontent.com/margaridamascarenhas/Transparency_Data/main/{file_name}'
    headers = {'Authorization': f'token {github_token}'}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        csv_content = StringIO(response.text)
        df = pd.read_csv(csv_content)
        if 'Date' in df.columns:
            df['Date'] = pd.to_datetime(df['Date'])  # Convert 'Date' column to datetime
            df.set_index('Date', inplace=True)       # Set 'Date' column as the index
        df = filter_dataframe(df)
        df = convert_European_time(df, bdz)
        return df[df.index >= pd.Timestamp('2024-01-01')]
    else:
        print(f"Failed to download {file_name}. Status code: {response.status_code}")
        return None


def filter_variable_options(df):
    """Pair actual/forecast columns that both contain data.

    Returns:
        (variable_options, flagged_columns): ``variable_options`` maps a
        display name to its (actual, forecast) column pair when both columns
        exist and are not entirely NaN; ``flagged_columns`` lists the columns
        that are missing or empty.
    """
    all_options = {
        "Load": ("Load_entsoe", "Load_forecast_entsoe"),
        "Solar": ("Solar_entsoe", "Solar_forecast_entsoe"),
        "Wind Onshore": ("Wind_onshore_entsoe", "Wind_onshore_forecast_entsoe"),
        "Wind Offshore": ("Wind_offshore_entsoe", "Wind_offshore_forecast_entsoe"),
    }
    variable_options = {}
    flagged_columns = []
    for key, (col1, col2) in all_options.items():
        col1_exists = col1 in df.columns and not df[col1].isna().all()
        col2_exists = col2 in df.columns and not df[col2].isna().all()
        if col1_exists and col2_exists:
            variable_options[key] = (col1, col2)
        elif not col1_exists and col2_exists:
            flagged_columns.append(col1)
        elif col1_exists and not col2_exists:
            flagged_columns.append(col2)
        else:
            flagged_columns.append(col1)
            flagged_columns.append(col2)
    return variable_options, flagged_columns


github_token = st.secrets["GitHub_Token_KUL_Margarida"]


@st.cache_data(show_spinner=False)
def load_all_from_github(github_token, countries, period_key):
    """Fetch all zones' CSVs from GitHub and return as dict keyed by zone.

    ``period_key`` only participates in the cache key so the download is
    refreshed once per hourly block.
    """
    data = {}
    for bdz in countries:
        df = load_GitHub(github_token, bdz)
        if df is not None:
            data[bdz] = df
    return data


# Bucket "now" into hourly blocks (rolling back one hour before minute 15) so
# the cached download above refreshes at most once per block.
# FIX: the module is imported as `import datetime`, so the class and timedelta
# must be fully qualified -- the original `datetime.now(tz)` and bare
# `timedelta` raised AttributeError/NameError (L5 already uses the qualified
# form, confirming the import style).
now = datetime.datetime.now(tz)
if now.minute >= 15:
    block_start = now.replace(minute=0, second=0, microsecond=0)
else:
    block_start = (now - datetime.timedelta(hours=1)).replace(minute=0, second=0, microsecond=0)

# Simple string key identifying the hourly cache block.
period_key = block_start.strftime("%Y-%m-%d-%H")

if github_token:
    data_dict = load_all_from_github(github_token, countries, period_key)
else:
    # FIX: the original carried a second, dangling `else: print(...)` branch;
    # a single error-and-stop branch is the correct Streamlit behaviour.
    st.error("Please enter your GitHub Personal Access Token to proceed.")
    st.stop()
# --- Page header ------------------------------------------------------------
col1, col2 = st.columns([5, 2])
with col1:
    st.title("Transparency++")
with col2:
    upper_space = col2.empty()  # FIX: was created twice; once is enough
    # Two nested columns for side-by-side partner logos.
    col2_1, col2_2 = st.columns(2)
    with col2_1:
        st.image("KU_Leuven_logo.png", width=100)
    with col2_2:
        st.image("energyville_logo.png", width=100)

st.write("**Evaluate and analyze ENTSO-E Transparency Platform data quality, forecast accuracy, and energy trends for ENTSO-E member countries.**")

# --- Sidebar filters --------------------------------------------------------
st.sidebar.header('Filters')
st.sidebar.subheader("Select Country")
st.sidebar.caption("Choose the country for which you want to display data or forecasts.")
selection = ['Overall'] + list(countries)
selected_country = st.sidebar.selectbox('Select Country', selection)

if selected_country != 'Overall':
    st.sidebar.subheader("Section")
    st.sidebar.caption("Select the type of information you want to explore.")
    section = st.sidebar.radio('', ['Data Quality', 'Forecasts Quality', 'Insights'], index=1)
else:
    section = None  # no section shown for the aggregated view

if selected_country == 'Overall':
    data = None
    section = None
else:
    country_code = selected_country
    data = data_dict.get(selected_country)

if section == 'Data Quality':
    st.header('Data Quality')

    # Installed capacities for the selected zone; either year may be missing.
    caps4 = installed_capacities_2024.get(country_code)
    caps5 = installed_capacities_2025.get(country_code)

    st.write(
        "The table below presents the data quality metrics focusing on the percentage "
        "of missing values and the occurrence of extreme or nonsensical values for "
        "the selected country. Additionally, it flags any mismatch between installed "
        "capacity (NaN or 0) and actual data in the dataset."
    )

    # Evaluate only complete days: slice up to yesterday 23:59:59.
    yesterday = datetime.datetime.now(tz).date() - datetime.timedelta(days=1)
    end_time = pd.Timestamp(yesterday).replace(hour=23, minute=59, second=59)
    data_quality = data[data.index <= end_time]

    tech_cols = {
        'Load': ('Load_entsoe', 'Load_forecast_entsoe'),
        'Wind Onshore': ('Wind_onshore_entsoe', 'Wind_onshore_forecast_entsoe'),
        'Wind Offshore': ('Wind_offshore_entsoe', 'Wind_offshore_forecast_entsoe'),
        'Solar': ('Solar_entsoe', 'Solar_forecast_entsoe'),
    }

    # Drop technology columns that are entirely zero/NaN AND have no installed
    # capacity in either year -- the zone simply has no such plant, so the
    # columns carry no quality information.
    skip_cols = []
    for tech_key, (act_col, fct_col) in tech_cols.items():
        if act_col in data_quality.columns and fct_col in data_quality.columns:
            cap4 = caps4.get(tech_key, np.nan) if isinstance(caps4, dict) else np.nan
            cap5 = caps5.get(tech_key, np.nan) if isinstance(caps5, dict) else np.nan
            if (pd.isna(cap4) or cap4 == 0) and (pd.isna(cap5) or cap5 == 0):
                act = data_quality[act_col]
                fct = data_quality[fct_col]
                only_zero_or_na = (act.fillna(0) == 0).all() and (fct.fillna(0) == 0).all()
                if only_zero_or_na:
                    skip_cols += [act_col, fct_col]
    if skip_cols:
        data_quality = data_quality.drop(columns=skip_cols, errors='ignore')

    # Percentage of missing samples per remaining column.
    missing_values = (data_quality.isna().mean() * 100).round(2)

    extreme_values = {}
    capacity_mismatch = {}
    # Boundary between the 2024 and 2025 capacity tables.
    # FIX: removed the unused `neg_counts`/`over_counts` locals.
    cutoff = pd.Timestamp('2025-01-01')

    for col in data_quality.columns:
        # Map column name to its technology (wind checked before load so the
        # substring match cannot misfire).
        if 'Solar' in col:
            tech_key = 'Solar'
        elif 'Wind_onshore' in col:
            tech_key = 'Wind Onshore'
        elif 'Wind_offshore' in col:
            tech_key = 'Wind Offshore'
        elif 'Load' in col:
            tech_key = 'Load'
        else:
            extreme_values[col] = np.nan
            capacity_mismatch[col] = np.nan
            continue

        series = data_quality[col]
        mask_2024 = series.index < cutoff

        cap4 = caps4.get(tech_key, np.nan) if isinstance(caps4, dict) else np.nan
        cap5 = caps5.get(tech_key, np.nan) if isinstance(caps5, dict) else np.nan
        # FIX: removed leftover debug print() calls for col/cap4.

        if tech_key == 'Load':
            # For load only negative values count as nonsensical.
            extreme_pct = round((series < 0).mean() * 100, 2)
            mismatch = np.nan
        else:
            # Per-timestamp capacity: 2024 value before the cutoff, 2025 after.
            cap_series = pd.Series(np.where(mask_2024, cap4, cap5), index=series.index)
            neg = series < 0
            over = (series > cap_series) & cap_series.notna()
            extreme_pct = round((neg | over).mean() * 100, 2)

            # Mismatch: non-zero generation although capacity is missing or
            # zero in both years (treat NaN samples as zero).
            no_cap_2024 = pd.isna(cap4) or (cap4 == 0)
            no_cap_2025 = pd.isna(cap5) or (cap5 == 0)
            has_nonzero = (series.fillna(0) != 0).any()
            mismatch = 100.0 if (no_cap_2024 and no_cap_2025 and has_nonzero) else 0.0

        extreme_values[col] = extreme_pct
        capacity_mismatch[col] = mismatch

    display_extreme = {col: f"{val:.2f}" if not pd.isna(val) else ''
                       for col, val in extreme_values.items()}
    display_mismatch = {}
    for col, val in capacity_mismatch.items():
        if 'Load' in col:
            display_mismatch[col] = '-'
        else:
            # FIX: restored the mojibake'd flag emoji (text below describes a
            # "flag"; the original bytes were encoding-mangled).
            display_mismatch[col] = '🚩' if val == 100.0 else ''

    # Build and render the metrics table.
    metrics_df = pd.DataFrame({
        'Missing Values (%)': missing_values,
        'Extreme/Nonsensical Values (%)': pd.Series(display_extreme),
        'Capacity Mismatch Flag': pd.Series(display_mismatch)
    })
    st.dataframe(metrics_df.style.format({
        'Missing Values (%)': '{:.2f}',
        'Extreme/Nonsensical Values (%)': '{}'
    }))

    st.write('Missing values (%): Percentage of missing values in the dataset', unsafe_allow_html=True)
    st.write('Extreme/Nonsensical values (%): For Load, this is % of values below 0. For generation, it is negative or out-of-bound (> capacity).', unsafe_allow_html=True)
For generation, it is negative or out-of-bound (> capacity).',unsafe_allow_html=True) st.write('Capacity Mismatch Flag: Shows "๐ฉ" if installed capacity is `NaN` or `0` but the dataset has non-zero generation. Blank otherwise. For Load columns, it is "-".',unsafe_allow_html=True) elif section == 'Forecasts Quality': st.header('Forecast Quality') # Time series for last 1 week last_week = data.loc[data.index >= (data.index[-1] - pd.Timedelta(days=7))] st.write('The below plot shows the time series of forecasts vs. observations provided by the ENTSO-E Transparency platform from the past week.') variable_options, flagged_columns = filter_variable_options(last_week) # Dropdown to select the variable selected_variable = st.selectbox("Select Variable for Line PLot", list(variable_options.keys())) actual_col, forecast_col = variable_options[selected_variable] x_vals = last_week.index.to_pydatetime().tolist() y_actual = last_week[actual_col].tolist() y_forecast = last_week[forecast_col].tolist() # then plot fig = go.Figure() fig.add_trace(go.Scatter(x=x_vals,y=y_actual,mode="lines",name="Actual")) fig.add_trace(go.Scatter(x=x_vals,y=y_forecast,mode="lines",name="Forecast ENTSO-E")) fig.update_layout(title=f"Forecasts vs Actual for {selected_variable}",xaxis_title="Date",yaxis_title="Value [MW]") st.plotly_chart(fig) # Scatter plots for error distribution st.subheader('Error Distribution') st.write('The below scatter plots show the error distribution of all fields: Solar, Wind and Load.') selected_variable = st.selectbox("Select Variable for Error Distribution", list(variable_options.keys())) # Get the corresponding columns for the selected variable actual_col, forecast_col = variable_options[selected_variable] if forecast_col in data.columns: # grab the two series, drop any NaNs, and align on their common timestamps obs = data[actual_col].dropna() pred = data[forecast_col].dropna() idx = obs.index.intersection(pred.index) obs = obs.loc[idx] pred = pred.loc[idx] # convert to 
pure Python lists x_vals = obs.tolist() y_vals = pred.tolist() fig = go.Figure() fig.add_trace(go.Scatter(x=x_vals,y=y_vals,mode='markers',name=f'{selected_variable}')) fig.update_layout(title=f'Error Distribution for {selected_variable}',xaxis_title='Observed [MW]',yaxis_title='Forecast ENTSO-E [MW]') st.plotly_chart(fig) st.subheader('Accuracy Metrics (Sorted by rMAE):') date_range = st.date_input( "Select Date Range for Metrics Calculation:", value=(pd.to_datetime("2024-01-01"), pd.to_datetime(pd.Timestamp('today'))) ) if len(date_range) == 2: start_date = pd.Timestamp(date_range[0]) end_date = pd.Timestamp(date_range[1]) else: st.error("Please select a valid date range.") st.stop() output_text = f"The below metrics are calculated from the selected date range from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}. On the right is a radar plot with the rMAE." st.write(output_text) data_metrics = data.loc[start_date:end_date] accuracy_metrics = pd.DataFrame(columns=['MAE', 'RMSE' ,'rMAE'], index=list(variable_options.keys())) for variable in variable_options.keys(): actual_col, forecast_col = variable_options[variable] obs = data_metrics[actual_col] pred = data_metrics[forecast_col] error = pred - obs mae = round(np.mean(np.abs(error)),2) if 'Load' in actual_col: persistence = obs.shift(168) # Weekly persistence else: persistence = obs.shift(24) # Daily persistence # Using the whole year's data for rMAE calculations rmae = round(mae / np.mean(np.abs(obs - persistence)),2) rmse = round(np.sqrt(np.mean((error)**2)), 2) row_label = variable #'Load' if 'Load' in actual_col else 'Solar' if 'Solar' in actual_col else 'Wind Offshore' if 'Wind_offshore' in actual_col else 'Wind Onshore' accuracy_metrics.loc[row_label] = [mae, rmse, rmae] accuracy_metrics.dropna(how='all', inplace=True)# Sort by rMAE (second column) accuracy_metrics.sort_values(by=accuracy_metrics.columns[-1], ascending=True, inplace=True) accuracy_metrics = accuracy_metrics.round(4) 
col1, col2 = st.columns([1, 1]) with col1: # (optional) some top-margin before the table st.markdown( """ """, unsafe_allow_html=True ) st.dataframe(accuracy_metrics) with col2: # prepare the data rmae_values = accuracy_metrics['rMAE'].tolist() categories = accuracy_metrics.index.tolist() # build the radar fig = go.Figure( go.Scatterpolar( r=rmae_values, theta=categories, fill='toself', name='rMAE' ) ) # ๐ shrink the total size, and give extra left/right margin for your labels fig.update_layout( width=300, # make the whole plot a bit smaller height=300, margin=dict( l=50, # more space on the left for long category names r=60, # and on the right, if needed t=20, b=20 ), polar=dict( angularaxis=dict( tickfont=dict(size=11) # if you want slightly smaller ticks ), radialaxis=dict( visible=True, range=[0, max(rmae_values)*1.2] ) ), showlegend=False ) # wrap in a div so you can still control vertical spacing via CSS st.markdown('