"""Gradio front end for exploring an event log with process-discovery helpers.

Workflow exposed by the UI: upload a CSV, map three of its columns to the
canonical ``case_id`` / ``activity`` / ``timestamp`` names, then request
per-case statistics or a process map built by the helpers in ``model``.
"""

import logging
import os
from collections import OrderedDict
from io import StringIO
from pathlib import Path
from typing import Optional

import gradio as gr
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from dotenv import load_dotenv

# Disabled imports kept from the original file:
# from PIL import Image
# import cv2
# import numpy as np

from model import (
    dfg2networkx,
    discover_process_map,
    discover_process_map_activities_connections,
    discover_process_map_variants,
    view_process_map,
)

load_dotenv()

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Regex separator: accept either ';' or ',' as the field delimiter.
CSV_SEPARATOR_PATTERN = ';|,'

# Column names every handler after `etl` relies on.
CANONICAL_COLUMNS = ('case_id', 'activity', 'timestamp')


def _lead_time_hours(timestamps):
    """Return whole hours between the earliest and latest value of *timestamps*."""
    return (timestamps.max() - timestamps.min()).total_seconds() // 3600


def get_data(temp_file, case_col, activity_col, timestamp_col, state: dict):
    """Load the uploaded CSV and offer its columns in the three dropdowns.

    Args:
        temp_file: Either a path string or an object exposing the path as
            ``.name``.
        case_col: Current value of the "Case" dropdown (unused).
        activity_col: Current value of the "Activity" dropdown (unused).
        timestamp_col: Current value of the "Timestamp" dropdown (unused).
        state: Per-session dict; the parsed frame is stored under ``'df'``.

    Returns:
        ``(df, case_dropdown, activity_dropdown, timestamp_dropdown, state)``
        where each dropdown lists every column of the parsed frame.
    """
    csv_path = temp_file if isinstance(temp_file, str) else temp_file.name
    # A multi-character separator is a regex, which only the Python parsing
    # engine supports; naming the engine avoids the fallback ParserWarning.
    df = pd.read_csv(csv_path, sep=CSV_SEPARATOR_PATTERN, engine="python")
    state['df'] = df

    columns = list(df.columns)
    return (
        df,
        gr.Dropdown(
            choices=list(columns), multiselect=False,
            label="Case", info="選擇 Case ID"),
        gr.Dropdown(
            choices=list(columns), multiselect=False,
            label="Activity", info="選擇 Activity ID"),
        gr.Dropdown(
            choices=list(columns), multiselect=False,
            label="Timestamp", info="選擇 Timestamp"),
        state,
    )


def get_stats(state: dict):
    """Build the summary table and the three per-case bar plots.

    Args:
        state: Per-session dict whose ``'df'`` entry carries the
            ``case_id`` / ``activity`` / ``timestamp`` columns.

    Returns:
        ``(summary, case_stats_plot, lead_time_plot, avg_duration_plot, state)``.

    Raises:
        gr.Error: If the event log is missing or its columns have not been
            mapped to the canonical names yet.
    """
    df = state.get('df', pd.DataFrame())
    if any(column not in df.columns for column in CANONICAL_COLUMNS):
        raise gr.Error(
            'Upload a CSV file and click "Select Columns" before requesting stats.')

    summary = pd.DataFrame({
        "metric": ["資料筆數", "Case 數量", "Activity 數量", "起始時間", "結束時間"],
        "value": [
            df.shape[0],
            df['case_id'].nunique(),
            df['activity'].nunique(),
            df['timestamp'].min(),
            df['timestamp'].max(),
        ],
    })

    per_case = df.groupby('case_id', as_index=False)

    # Number of events recorded for each case.
    case_stats = per_case.agg(count=('activity', 'size'))
    logger.debug("case stats: %s", case_stats)

    # Whole hours between the first and last event of each case.
    case_lead_time = per_case.agg(duration=('timestamp', _lead_time_hours))
    logger.debug("case lead time: %s", case_lead_time)

    # NOTE(review): the original computed "avg_duration" with exactly the same
    # max-min formula as the lead time, so this chart shows identical values.
    # The values are kept as they were; confirm what average was intended.
    case_avg_duration = case_lead_time.rename(columns={'duration': 'avg_duration'})

    return (
        summary,
        gr.BarPlot(
            case_stats, x="case_id", y="count", title="Case Stats",
            tooltip=["case_id", "count"], width=None),
        gr.BarPlot(
            case_lead_time, x="case_id", y="duration", title="Case Lead Time",
            tooltip=["case_id", "duration"], width=None),
        gr.BarPlot(
            case_avg_duration, x="case_id", y="avg_duration",
            title="Case Average Duration",
            tooltip=["case_id", "avg_duration"], width=None),
        state,
    )


def get_process_map(state: Optional[dict] = None):
    """Discover a Petri-net process model and return its image.

    Args:
        state: Per-session dict holding the event log under ``'df'``; an
            empty frame is used when the entry is absent.

    Returns:
        ``(img, state)`` where ``img`` is the second value returned by
        ``discover_process_map``.
    """
    # A fresh dict per call: a literal `{}` default would be shared by all calls.
    state = {} if state is None else state
    df = state.get('df', pd.DataFrame()).copy()
    _net, img = discover_process_map(df, type='petrinet')
    return img, state


def get_process_map_variants(top_k: int = 1, state: Optional[dict] = None):
    """Draw the directly-follows graph for the ``top_k`` variants.

    Args:
        top_k: Value of the "Top-K" slider, forwarded to
            ``discover_process_map_variants``.
        state: Per-session dict holding the event log under ``'df'``.

    Returns:
        ``(chart, state)``. ``state['top_variant_connections']`` is set to the
        graph's entries ordered by descending value, and
        ``state['top_variant']`` is recorded the first time ``top_k == 1``.
    """
    # This handler writes into `state`, so it must never be a shared default.
    state = {} if state is None else state
    df = state.get('df', pd.DataFrame()).copy()
    dfg, start_activities, end_activities = discover_process_map_variants(
        df, top_k, type='dfg')

    state['top_variant_connections'] = OrderedDict(
        sorted(dfg.items(), key=lambda item: item[1], reverse=True))
    if 'top_variant' not in state and top_k == 1:
        state['top_variant'] = {
            'dfg': dfg,
            'start_activities': start_activities,
            'end_activities': end_activities,
        }

    nx_graph = dfg2networkx(dfg, start_activities, end_activities)
    chart = view_process_map(nx_graph, process_type='dfg', layout_type='sfdp')
    return chart, state


def get_process_map_activities_connections(
        activity_rank: int = 0,
        connection_rank: int = 0,
        state: Optional[dict] = None):
    """Draw the directly-follows graph filtered by activity/connection rank.

    Args:
        activity_rank: Forwarded to
            ``discover_process_map_activities_connections``.
        connection_rank: Forwarded to the same helper.
        state: Per-session dict holding the event log under ``'df'``; it is
            also passed through to the helper.

    Returns:
        ``(chart, state)``.
    """
    state = {} if state is None else state
    df = state.get('df', pd.DataFrame()).copy()
    dfg, start_activities, end_activities = discover_process_map_activities_connections(
        df,
        activity_rank=activity_rank,
        connection_rank=connection_rank,
        state=state)
    nx_graph = dfg2networkx(dfg, start_activities, end_activities)
    chart = view_process_map(nx_graph, process_type='dfg', layout_type='sfdp')
    return chart, state


def etl(case_col, activity_col, timestamp_col, state: dict):
    """Normalise the chosen columns and rename them to the canonical names.

    The case and activity columns are cast to ``str``, the timestamp column is
    parsed with ``pd.to_datetime``, and the three columns are renamed to
    ``case_id``, ``activity`` and ``timestamp``.

    Args:
        case_col: Name of the column identifying the case.
        activity_col: Name of the column identifying the activity.
        timestamp_col: Name of the column holding the event time.
        state: Per-session dict holding the uploaded frame under ``'df'``;
            the entry is replaced by the normalised frame.

    Returns:
        ``(df, state)`` with the normalised frame.

    Raises:
        gr.Error: If nothing has been uploaded, a selection is missing from
            the frame, or one column is chosen for more than one role.
    """
    if 'df' not in state:
        raise gr.Error("Upload a CSV file before selecting columns.")
    df = state['df'].copy()

    selection = {
        "Case": case_col,
        "Activity": activity_col,
        "Timestamp": timestamp_col,
    }
    unknown = [role for role, column in selection.items() if column not in df.columns]
    if unknown:
        raise gr.Error(
            f"Select an existing column for: {', '.join(unknown)}.")
    if len(set(selection.values())) < len(selection):
        raise gr.Error(
            "Case, Activity and Timestamp must be three different columns.")

    # Plain column assignment replaces the column together with its dtype;
    # `df.loc[:, col] = ...` writes into the existing array instead, which in
    # pandas 2.x can leave the old dtype in place.
    df[case_col] = df[case_col].astype(str)
    df[activity_col] = df[activity_col].astype(str)
    df[timestamp_col] = pd.to_datetime(df[timestamp_col])  # format='%Y-%m-%d %H:%M:%S'
    df = df.rename(columns={
        case_col: 'case_id',
        activity_col: 'activity',
        timestamp_col: 'timestamp',
    })

    state['df'] = df
    return df, state


## --- block --- ##
css = """ h1 { text-align: center; display:block; } """

demo = gr.Blocks(css=css)

# NOTE(review): the component nesting below was reconstructed from a
# flattened copy of this file — confirm the layout against the running app.
with demo:
    gr.Markdown("# 🌟 Process Discovery 🌟")
    state = gr.State(value={})

    with gr.Row():
        upl_btn = gr.UploadButton(
            label="Upload", file_types=['.csv'], file_count="single")

    with gr.Accordion('Data Preview'):
        df = gr.Dataframe()

    with gr.Row():
        case_col = gr.Dropdown(
            multiselect=False, label="Case", info="選擇 Case ID")
        activity_col = gr.Dropdown(
            multiselect=False, label="Activity", info="選擇 Activity ID")
        timestamp_col = gr.Dropdown(
            multiselect=False, label="Timestamp", info="選擇 Timestamp")

    upl_btn.upload(
        fn=get_data,
        inputs=[upl_btn, case_col, activity_col, timestamp_col, state],
        outputs=[df, case_col, activity_col, timestamp_col, state])

    column_btn = gr.Button("Select Columns")
    column_btn.click(
        fn=etl,
        inputs=[case_col, activity_col, timestamp_col, state],
        outputs=[df, state])

    with gr.Row():
        with gr.Tab('Data Explorer'):
            # outputs.append(gr.Dataframe( label="Event logs"))
            de_btn = gr.Button("Get Stats")
            with gr.Row():
                summary = gr.Dataframe(
                    label="Summary", interactive=False, height=300)
                chart1 = gr.BarPlot(label="Case Stats")
                chart2 = gr.BarPlot(label="Case Lead Time Stats")
                chart3 = gr.BarPlot(label="Case Average Activity Time Stats")
            de_btn.click(
                fn=get_stats,
                inputs=[state],
                outputs=[summary, chart1, chart2, chart3, state])

        with gr.Tab('Variant Explorer'):
            ve_btn = gr.Button("Get Variants")
            top_k_variant_selector = gr.Slider(
                0, 10, value=1, step=1,
                label="Top-K", info="選擇 Variant 數量(0: 全選)")
            pmchart = gr.Plot(label="Process Map")
            ve_btn.click(
                fn=get_process_map_variants,
                inputs=[top_k_variant_selector, state],
                outputs=[pmchart, state])

        # Disabled tabs, kept as they appeared in the original file:
        # with gr.Tab('Process Explorer'):
        #     pe_btn = gr.Button("Get Activities & Connections")
        #     with gr.Column():
        #         top_k_activity_selector = gr.Slider(0, 10, value=1, step=1, label="Activity", info="【pending】增減 Top Activity 數量(0: 全選)")
        #         top_k_connection_selector = gr.Slider(0, 10, value=1, step=1, label="Connection", info="增減 Top Connection 數量(0: 全選)")
        #     pmchart = gr.Plot( label="Process Map")
        #     pe_btn.click( fn=get_process_map_activities_connections, inputs = [ top_k_activity_selector, top_k_connection_selector, state], outputs=[ pmchart, state])
        # with gr.Tab('Process Model'):
        #     cc_btn = gr.Button("Get Process Model")
        #     img = gr.Image( label="Process Model")
        #     cc_btn.click( fn=get_process_map, inputs = [state], outputs=[ img, state])


if __name__ == "__main__":
    # `int(None)` raises TypeError, so only convert PORT when it is set;
    # passing None leaves the port choice to Gradio.
    port = os.environ.get("PORT")
    demo.launch(
        # share=True,
        server_name="0.0.0.0",
        server_port=int(port) if port else None,
        # NOTE(review): if USER_NAME / PASSWORD are unset this is
        # (None, None) — confirm how Gradio treats that.
        auth=(os.environ.get("USER_NAME"), os.environ.get("PASSWORD")),
    )