Andrea Maldonado committed
Commit: 3735e7d
1 Parent(s): 3c2100c

Remove unused
Browse files:
- config_files/algorithm/fix_24.json +0 -34
- dashboard.py +0 -295
- gedi/__init__.py +0 -1
- gedi/analyser.py +0 -123
- gedi/utils/algorithms/__init__.py +0 -67
- gedi/utils/algorithms/tsne.py +0 -69
- main.py +0 -1
config_files/algorithm/fix_24.json
DELETED
@@ -1,34 +0,0 @@
-[
-    {
-        "pipeline_step": "event_logs_generation",
-        "output_path":"data/generated",
-        "generator_params": {
-            "objectives": {
-                "normalized_sequence_entropy_linear_forgetting": 0.05,
-                "ratio_top_20_variants": 0.4
-            },
-            "config_space": {
-                "mode": [5, 40],
-                "sequence": [0.01, 1],
-                "choice": [0.01, 1],
-                "parallel": [0.01, 1],
-                "loop": [0.01, 1],
-                "silent": [0.01, 1],
-                "lt_dependency": [0.01, 1],
-                "num_traces": [100, 1001],
-                "duplicate": [0],
-                "or": [0]
-            },
-            "n_trials": 20
-        }
-    },
-    {
-        "pipeline_step": "feature_extraction",
-        "input_path": "data/generated",
-        "feature_params": {"feature_set":["simple_stats", "trace_length", "trace_variant", "activities", "start_activities", "end_activities", "entropies", "complexity"]},
-        "feature_params": {"feature_set":["trace_length"]},
-        "output_path": "output/plots",
-        "real_eventlog_path": "data/log_meta_features.csv",
-        "plot_type": "boxplot"
-    }
-]
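Note that the deleted config declared "feature_params" twice in the second pipeline step. Python's json module keeps only the last occurrence of a duplicate key, so the longer feature set on the first line was silently ignored. A minimal standalone check (illustrative snippet, not part of the repo):

    import json

    # Duplicate keys are tolerated in JSON text, but json.loads keeps only the last one.
    raw = '{"feature_params": {"feature_set": ["simple_stats"]}, "feature_params": {"feature_set": ["trace_length"]}}'
    print(json.loads(raw))  # {'feature_params': {'feature_set': ['trace_length']}}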
dashboard.py
DELETED
@@ -1,295 +0,0 @@
-from copy import deepcopy
-from meta_feature_extraction.simple_stats import simple_stats
-from meta_feature_extraction.trace_length import trace_length
-from meta_feature_extraction.trace_variant import trace_variant
-from meta_feature_extraction.activities import activities
-from meta_feature_extraction.start_activities import start_activities
-from meta_feature_extraction.end_activities import end_activities
-from meta_feature_extraction.entropies import entropies
-from pm4py import discover_petri_net_inductive as inductive_miner
-from pm4py import generate_process_tree
-from pm4py import save_vis_petri_net, save_vis_process_tree
-from pm4py.algo.filtering.log.variants import variants_filter
-from pm4py.algo.simulation.tree_generator import algorithm as tree_generator
-from pm4py.algo.simulation.playout.process_tree import algorithm as playout
-from pm4py.objects.conversion.log import converter as log_converter
-from pm4py.objects.log.exporter.xes import exporter as xes_exporter
-from pm4py.objects.log.importer.xes import importer as xes_importer
-from pm4py.objects.log.util import dataframe_utils
-from pm4py.sim import play_out
-
-import matplotlib.image as mpimg
-import os
-import pandas as pd
-import streamlit as st
-
-OUTPUT_PATH = "output"
-SAMPLE_EVENTS = 500
-
-@st.cache(allow_output_mutation=True)
-def load_from_xes(uploaded_file):
-    bytes_data = uploaded_file.getvalue()
-    log1 = xes_importer.deserialize(bytes_data)
-    get_stats(log1)
-    return log1
-
-@st.cache
-def load_from_csv(uploaded_file, sep):
-    if uploaded_file is not None:
-        df = pd.read_csv(uploaded_file, sep=sep, index_col=False)
-        return df
-
-def get_stats(log, save=True):
-    """Returns the statistics of an event log."""
-    num_traces = len(log)
-    num_events = sum([len(c) for c in log])
-    num_utraces = len(variants_filter.get_variants(log))
-    if save:
-        st.session_state["num_traces"] = num_traces
-        st.session_state["num_events"] = num_events
-        st.session_state["num_utraces"] = num_utraces
-    return num_utraces, num_traces, num_events
-
-#@st.cache
-def df_to_log(df, case_id, activity, timestamp):
-    df.rename(columns={case_id: 'case:concept:name',
-                       activity: 'concept:name',
-                       timestamp: "time:timestamp"}, inplace=True)
-    temp = dataframe_utils.convert_timestamp_columns_in_df(df)
-    #temp = temp.sort_values(timestamp)
-    log = log_converter.apply(temp)
-    return log, 'concept:name', "time:timestamp"
-
-def read_uploaded_file(uploaded_file):
-    extension = uploaded_file.name.split('.')[-1]
-    log_name = uploaded_file.name.split('.')[-2]
-
-    st.sidebar.write("Loaded ", extension.upper(), '-File: ', uploaded_file.name)
-    if extension == "xes":
-        event_log = load_from_xes(uploaded_file)
-        log_columns = [*list(event_log[0][0].keys())]
-        convert_button = False
-        case_id = "case:concept:name"
-        activity = "concept:name"
-        timestamp = "time:timestamp"
-        default_act_id = log_columns.index("concept:name")
-        default_tst_id = log_columns.index("time:timestamp")
-
-        event_df = log_converter.apply(event_log, variant=log_converter.Variants.TO_DATA_FRAME)
-        df_path = OUTPUT_PATH+"/"+log_name+".csv"
-        event_df.to_csv(df_path, sep =";", index=False)
-        return event_log, event_df, case_id, activity
-
-    elif extension == "csv":
-        sep = st.sidebar.text_input("Columns separator", ";")
-        event_df = load_from_csv(uploaded_file, sep)
-        old_df = deepcopy(event_df)
-        log_columns = event_df.columns
-
-        case_id = st.sidebar.selectbox("Choose 'case' column:", log_columns)
-        activity = st.sidebar.selectbox("Choose 'activity' column:", log_columns, index=0)
-        timestamp = st.sidebar.selectbox("Choose 'timestamp' column:", log_columns, index=0)
-
-        convert_button = st.sidebar.button('Confirm selection')
-        if convert_button:
-            temp = deepcopy(event_df)
-            event_log, activity, timestamp = df_to_log(temp, case_id, activity, timestamp)
-            #xes_exporter.apply(event_log, INPUT_XES)
-            log_columns = [*list(event_log[0][0].keys())]
-            st.session_state['log'] = event_log
-            return event_log, event_df, case_id, activity
-
-def sample_log_traces(complete_log, sample_size):
-    '''
-    Samples random traces out of logs.
-    So that number of events is slightly over SAMPLE_SIZE.
-    :param complete_log: Log extracted from xes
-    '''
-
-    log_traces = variants_filter.get_variants(complete_log)
-    keys = list(log_traces.keys())
-    sample_traces = {}
-    num_evs = 0
-    while num_evs < sample_size:
-        if len(keys) == 0:
-            break
-        random_trace = keys.pop()
-        sample_traces[random_trace] = log_traces[random_trace]
-        evs = sum([len(case_id) for case_id in sample_traces[random_trace]])
-        num_evs += evs
-    log1 = variants_filter.apply(complete_log, sample_traces)
-    return log1
-
-def show_process_petrinet(event_log, filter_info, OUTPUT_PATH):
-    OUTPUT_PLOT = f"{OUTPUT_PATH}_{filter_info}".replace(":","").replace(".","")+".png" # OUTPUT_PATH is OUTPUT_PATH+INPUT_FILE
-
-    try:
-        fig_pt = mpimg.imread(OUTPUT_PLOT)
-        st.write("Loaded from memory")
-    except FileNotFoundError:
-        net, im, fm = inductive_miner(event_log)
-        # parameters={heuristics_miner.Variants.CLASSIC.value.Parameters.DEPENDENCY_THRESH: 0.99,
-        # pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "png"})
-        #parameters = {pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "png"}
-        save_vis_petri_net(net, im, fm, OUTPUT_PLOT)
-        st.write("Saved in: ", OUTPUT_PLOT)
-        fig_pt = mpimg.imread(OUTPUT_PLOT)
-    st.image(fig_pt)
-
-def show_loaded_event_log(event_log, event_df):
-    get_stats(event_log)
-    st.write("### Loaded event-log")
-    col1, col2 = st.columns(2)
-    with col2:
-        st.dataframe(event_df)
-    with col1:
-        show_process_petrinet(event_log, None, OUTPUT_PATH+"running-example")
-
-def extract_meta_features(log, log_name):
-    mtf_cols = ["log", "n_traces", "n_unique_traces", "ratio_unique_traces_per_trace", "n_events", "trace_len_min", "trace_len_max",
-                "trace_len_mean", "trace_len_median", "trace_len_mode", "trace_len_std", "trace_len_variance", "trace_len_q1",
-                "trace_len_q3", "trace_len_iqr", "trace_len_geometric_mean", "trace_len_geometric_std", "trace_len_harmonic_mean",
-                "trace_len_skewness", "trace_len_kurtosis", "trace_len_coefficient_variation", "trace_len_entropy", "trace_len_hist1",
-                "trace_len_hist2", "trace_len_hist3", "trace_len_hist4", "trace_len_hist5", "trace_len_hist6", "trace_len_hist7",
-                "trace_len_hist8", "trace_len_hist9", "trace_len_hist10", "trace_len_skewness_hist", "trace_len_kurtosis_hist",
-                "ratio_most_common_variant", "ratio_top_1_variants", "ratio_top_5_variants", "ratio_top_10_variants", "ratio_top_20_variants",
-                "ratio_top_50_variants", "ratio_top_75_variants", "mean_variant_occurrence", "std_variant_occurrence", "skewness_variant_occurrence",
-                "kurtosis_variant_occurrence", "n_unique_activities", "activities_min", "activities_max", "activities_mean", "activities_median",
-                "activities_std", "activities_variance", "activities_q1", "activities_q3", "activities_iqr", "activities_skewness",
-                "activities_kurtosis", "n_unique_start_activities", "start_activities_min", "start_activities_max", "start_activities_mean",
-                "start_activities_median", "start_activities_std", "start_activities_variance", "start_activities_q1", "start_activities_q3",
-                "start_activities_iqr", "start_activities_skewness", "start_activities_kurtosis", "n_unique_end_activities", "end_activities_min",
-                "end_activities_max", "end_activities_mean", "end_activities_median", "end_activities_std", "end_activities_variance",
-                "end_activities_q1", "end_activities_q3", "end_activities_iqr", "end_activities_skewness", "end_activities_kurtosis", "entropy_trace",
-                "entropy_prefix", "entropy_global_block", "entropy_lempel_ziv", "entropy_k_block_diff_1", "entropy_k_block_diff_3",
-                "entropy_k_block_diff_5", "entropy_k_block_ratio_1", "entropy_k_block_ratio_3", "entropy_k_block_ratio_5", "entropy_knn_3",
-                "entropy_knn_5", "entropy_knn_7"]
-    features = [log_name]
-    features.extend(simple_stats(log))
-    features.extend(trace_length(log))
-    features.extend(trace_variant(log))
-    features.extend(activities(log))
-    features.extend(start_activities(log))
-    features.extend(end_activities(log))
-    features.extend(entropies(log_name, OUTPUT_PATH))
-
-    mtf = pd.DataFrame([features], columns=mtf_cols)
-
-    st.dataframe(mtf)
-    return mtf
-
-def generate_pt(mtf):
-    OUTPUT_PLOT = f"{OUTPUT_PATH}/generated_pt".replace(":","").replace(".","")#+".png" # OUTPUT_PATH is OUTPUT_PATH+INPUT_FILE
-
-    st.write("### PT Gen configurations")
-    col1, col2, col3, col4, col5, col6 = st.columns(6)
-    with col1:
-        param_mode = st.text_input('Mode', str(round(mtf['activities_median'].iat[0]))) #?
-        st.write("Sum of probabilities must be one")
-    with col2:
-        param_min = st.text_input('Min', str(mtf['activities_min'].iat[0]))
-        param_seq = st.text_input('Probability Sequence', 0.25)
-    with col3:
-        param_max = st.text_input('Max', str(mtf['activities_max'].iat[0]))
-        param_cho = st.text_input('Probability Choice (XOR)', 0.25)
-    with col4:
-        param_nmo = st.text_input('Number of models', 1)
-        param_par = st.text_input('Probability Parallel', 0.25)
-    with col5:
-        param_dup = st.text_input('Duplicates', 0)
-        param_lop = st.text_input('Probability Loop', 0.25)
-    with col6:
-        param_sil = st.text_input('Silent', 0.2)
-        param_or = st.text_input('Probability Or', 0.0)
-
-    PT_PARAMS = {tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.MODE: round(float(param_mode)), #most frequent number of visible activities
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.MIN: int(param_min), #minimum number of visible activities
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.MAX: int(param_max), #maximum number of visible activities
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.SEQUENCE: float(param_seq), #probability to add a sequence operator to tree
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.CHOICE: float(param_cho), #probability to add a choice (XOR) operator to tree
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.PARALLEL: float(param_par), #probability to add a parallel operator to tree
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.LOOP: float(param_lop), #probability to add a loop operator to tree
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.OR: float(param_or), #probability to add an or operator to tree
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.SILENT: float(param_sil), #probability to add silent activity to a choice or loop operator
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.DUPLICATE: int(param_dup), #probability to duplicate an activity label
-                 tree_generator.Variants.PTANDLOGGENERATOR.value.Parameters.NO_MODELS: int(param_nmo)} #number of trees to generate from model population
-
-    process_tree = generate_process_tree(parameters=PT_PARAMS)
-    save_vis_process_tree(process_tree, OUTPUT_PLOT+"_tree.png")
-
-    st.write("### Playout configurations")
-
-    param_ntraces = st.text_input('Number of traces', str(mtf['n_traces'].iat[0]))
-    PO_PARAMS = {playout.Variants.BASIC_PLAYOUT.value.Parameters.NO_TRACES : int(param_ntraces)}
-
-    ptgen_log = play_out(process_tree, parameters=PO_PARAMS)
-
-    net, im, fm = inductive_miner(ptgen_log)
-    save_vis_petri_net(net, im, fm, OUTPUT_PLOT+".png")
-    st.write("Saved in: ", OUTPUT_PLOT)
-    fig_pt_net = mpimg.imread(OUTPUT_PLOT+".png")
-    fig_pt_tree = mpimg.imread(OUTPUT_PLOT+"_tree.png")
-
-    fcol1, fcol2 = st.columns(2)
-    with fcol1:
-        st.image(fig_pt_tree)
-    with fcol2:
-        st.image(fig_pt_net)
-    extract_meta_features(ptgen_log, "gen_pt")
-
-
-if __name__ == '__main__':
-    st.set_page_config(layout='wide')
-    """
-    # Event Log Generator
-    """
-    start_options = ['Event-Log', 'Meta-features']
-    start_preference = st.sidebar.selectbox("Do you want to start with a log or with metafeatures?", start_options,0)
-    #lets_start = st.sidebar.button("Let's start with "+start_preference+'!')
-
-    if start_preference==start_options[0]:
-        st.sidebar.write("Upload a dataset in csv or xes-format:")
-        uploaded_file = st.sidebar.file_uploader("Pick a logfile")
-
-        bar = st.progress(0)
-
-        os.makedirs(OUTPUT_PATH, exist_ok=True)
-        event_log = st.session_state['log'] if "log" in st.session_state else None
-        if uploaded_file:
-            event_log, event_df, case_id, activity_id = read_uploaded_file(uploaded_file)
-            #event_log = deepcopy(event_log)
-
-            use_sample = st.sidebar.checkbox('Use random sample', True)
-            if use_sample:
-                sample_size = st.sidebar.text_input('Sample size of approx number of events', str(SAMPLE_EVENTS))
-                sample_size = int(sample_size)
-
-                event_log = sample_log_traces(event_log, sample_size)
-                sample_cases = [event_log[i].attributes['concept:name'] for i in range(0, len(event_log))]
-                event_df = event_df[event_df[case_id].isin(sample_cases)]
-
-            show_loaded_event_log(event_log, event_df)
-            ext_mtf = extract_meta_features(event_log, "running-example")
-            generate_pt(ext_mtf)
-
-    elif start_preference==start_options[1]:
-        LOG_COL = 'log'
-        st.sidebar.write("Upload a dataset in csv-format")
-        uploaded_file = st.sidebar.file_uploader("Pick a file containing meta-features")
-
-        bar = st.progress(0)
-
-        os.makedirs(OUTPUT_PATH, exist_ok=True)
-        event_log = st.session_state[LOG_COL] if "log" in st.session_state else None
-        if uploaded_file:
-            sep = st.sidebar.text_input("Columns separator", ";")
-            mtf = load_from_csv(uploaded_file, sep)
-            st.dataframe(mtf)
-
-            log_options = mtf['log'].unique()
-            log_preference = st.selectbox("What log should we use for generating a new event-log?", log_options,1)
-            mtf_selection = mtf[mtf[LOG_COL]==log_preference]
-            generate_pt(mtf_selection)
-            st.write("##### Original")
-            st.write(mtf_selection)
-
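For reference, the deleted dashboard cached its loaders with the legacy @st.cache decorator, which newer Streamlit releases have replaced with st.cache_data / st.cache_resource. A minimal sketch of the CSV loader in the current API (same signature assumed, not code from this repo):

    import pandas as pd
    import streamlit as st

    @st.cache_data  # successor to the legacy @st.cache used in the deleted file
    def load_from_csv(uploaded_file, sep):
        # Re-read the upload only when the file or the separator changes.
        if uploaded_file is not None:
            return pd.read_csv(uploaded_file, sep=sep, index_col=False)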
gedi/__init__.py
CHANGED
@@ -1,6 +1,5 @@
 from .generator import GenerateEventLogs
 from .features import EventLogFeatures
-from .analyser import FeatureAnalyser
 from .augmentation import InstanceAugmentator
 from .benchmark import BenchmarkTest
 from .plotter import BenchmarkPlotter, FeaturesPlotter, AugmentationPlotter, GenerationPlotter
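Since the package no longer re-exports the class, any downstream import of it now fails (illustrative snippet, not part of the repo):

    try:
        from gedi import FeatureAnalyser  # worked before this commit via gedi/__init__.py
    except ImportError:
        # Fails after this commit: the re-export and gedi/analyser.py are both gone.
        FeatureAnalyser = None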
gedi/analyser.py
DELETED
@@ -1,123 +0,0 @@
-import numpy as np
-import warnings
-
-from sklearn.decomposition import FastICA, PCA
-from sklearn.manifold import TSNE
-from sklearn.preprocessing import Normalizer, StandardScaler
-from gedi.features import EventLogFeatures
-from gedi.plotter import ModelResultPlotter
-from gedi.utils.matrix_tools import insert_missing_data
-# TODO: Call param_keys explicitly e.g. import INPUT_PATH
-from utils.param_keys import *
-from utils.param_keys.analyser import MODEL, INPUT_PARAMS, PERPLEXITY
-
-
-# FUDO: Use this class to compare models during evaluation
-class FeatureAnalyser:
-    def __init__(self, features, params=None):
-        self.features: EventLogFeatures = features
-        self.params: dict = {
-            PLOT_TYPE: params.get(PLOT_TYPE, COLOR_MAP),
-            PLOT_TICS: params.get(PLOT_TICS, True),
-            INTERACTIVE: params.get(INTERACTIVE, True),
-            N_COMPONENTS: params.get(N_COMPONENTS, 2),
-            PERPLEXITY: params.get(PERPLEXITY, 3)
-        }
-    def compare(self, model_parameter_list: list[dict], plot_results: bool = True) -> list[dict]:
-        """
-        :param model_parameter_list: list[dict]
-            Different model input parameters, saved in a list
-        :param plot_results: bool
-            Plots the components of the different models (default: True)
-            The function can be calculated
-        :return: list[dict]
-            The results of the models {MODEL, PROJECTION, EXPLAINED_VAR, INPUT_PARAMS}
-        """
-        model_results = []
-        for model_parameters in model_parameter_list:
-            try:
-                model_results.append(self.get_model_result(model_parameters))
-            except np.linalg.LinAlgError as e:
-                warnings.warn(f'Eigenvalue decomposition for model `{model_parameters}` could not be calculated:\n {e}')
-            except AssertionError as e:
-                warnings.warn(f'{e}')
-
-        if plot_results:
-            self.compare_with_plot(model_results)
-
-        return model_results
-
-    def compare_with_plot(self, model_results_list):
-        """
-        This method is used to compare the results in a plot, after fit_transforming different models.
-        @param model_results_list: list[dict]
-            Different model input parameters, saved in a list.
-        """
-        ModelResultPlotter().plot_models(
-            model_results_list,
-            plot_type=self.params[PLOT_TYPE],
-            plot_tics=self.params[PLOT_TICS],
-            components=self.params[N_COMPONENTS]
-        )
-
-    def get_model_result(self, model_parameters: dict, log: bool = True) -> dict:
-        """
-        Returns a dict of all the important result values. Used for analysing the different models
-        :param model_parameters: dict
-            The input parameters for the model
-        :param log: bool
-            Enables the log output while running the program (default: True)
-        :return: dict of the results: {MODEL, PROJECTION, EXPLAINED_VAR, INPUT_PARAMS}
-        """
-        model, projection = self.get_model_and_projection(model_parameters, log=log)
-        try:
-            ex_var = model.explained_variance_ratio_
-        except AttributeError as e:
-            warnings.warn(str(e))
-            ex_var = 0
-        return {MODEL: model, PROJECTION: projection, EXPLAINED_VAR: ex_var, INPUT_PARAMS: model_parameters}
-
-    def get_model_and_projection(self, model_parameters: dict, inp: np.ndarray = None, log: bool = True):
-        """
-        This method is fitting a model with the given parameters :model_parameters: and
-        the inp(ut) data is transformed on the model.
-        @param model_parameters: dict
-            The input parameters for the model.
-        @param inp: np.ndarray
-            Input data for the model (optional), (default: None -> calculated on the basis of the model_parameters)
-        @param log: bool
-            Enables the log output while running the program (default: True)
-        @return: fitted model and transformed data
-        """
-        if log:
-            print(f'Running {model_parameters}...')
-
-        if inp is None:
-            inp = insert_missing_data(self.features.feat)
-
-        if ALGORITHM_NAME not in model_parameters.keys():
-            raise KeyError(f'{ALGORITHM_NAME} is a mandatory model parameter.')
-
-        if model_parameters[ALGORITHM_NAME].startswith('normalized'):
-            inp = Normalizer(norm="l2").fit_transform(inp)
-        elif model_parameters[ALGORITHM_NAME].startswith('std_scaled'):
-            scaler = StandardScaler()
-            inp = scaler.fit_transform(inp)
-        try:
-            if 'pca' in model_parameters[ALGORITHM_NAME]:
-                # from sklearn.decomposition import PCA
-                pca = PCA(n_components=self.params[N_COMPONENTS])
-                # pca = coor.pca(data=inp, dim=self.params[N_COMPONENTS])
-                return pca, pca.fit_transform(inp)
-            elif 'tsne' in model_parameters[ALGORITHM_NAME]:
-                tsne = TSNE(n_components=self.params[N_COMPONENTS], learning_rate='auto',
-                            init='random', perplexity=self.params[PERPLEXITY])
-                return tsne, tsne.fit_transform(inp)
-            #elif model_parameters[ALGORITHM_NAME] == 'original_ica':
-            #    ica = FastICA(n_components=self.params[N_COMPONENTS])
-            #    return ica, ica.fit_transform(inp)
-            else:
-                warnings.warn(f'No original algorithm was found with name: {model_parameters[ALGORITHM_NAME]}')
-        except TypeError:
-            raise TypeError(f'Input data of the function is not correct. '
-                            f'Original algorithms take only 2-n-dimensional ndarray')
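The deleted FeatureAnalyser was essentially a thin wrapper around scikit-learn preprocessing plus PCA/t-SNE. A minimal standalone sketch of the same flow, with X standing in for the feature matrix the class pulled from EventLogFeatures (assumed data, not repo code):

    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.manifold import TSNE
    from sklearn.preprocessing import Normalizer

    X = np.random.rand(20, 8)                        # stand-in feature matrix
    X_norm = Normalizer(norm="l2").fit_transform(X)  # the 'normalized' branch above

    pca = PCA(n_components=2)                        # exposes explained_variance_ratio_
    pca_projection = pca.fit_transform(X_norm)

    tsne = TSNE(n_components=2, learning_rate="auto", init="random", perplexity=3)
    tsne_projection = tsne.fit_transform(X_norm)     # t-SNE reports no explained variance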
gedi/utils/algorithms/__init__.py
DELETED
@@ -1,67 +0,0 @@
-import numpy as np
-from sklearn.base import TransformerMixin, BaseEstimator
-
-from utils.param_keys import N_COMPONENTS
-
-
-class MyModel(TransformerMixin, BaseEstimator):
-    """
-    This class and some child classes are partly copied from:
-    https://towardsdatascience.com/implementing-pca-from-scratch-fb434f1acbaa
-    and commented with the help of:
-    https://www.askpython.com/python/examples/principal-component-analysis
-    """
-    def __init__(self):
-        self.explained_variance_ = None
-        self.components_ = None
-        self._standardized_data = None
-        self.n_components = None
-        self.n_samples = None
-        self._covariance_matrix = None
-
-    def __str__(self):
-        return f'{self.__class__.__name__}:\ncomponents={self.n_components}'
-
-    def fit_transform(self, data_ndarray, **fit_params):
-        self.fit(data_ndarray, **fit_params)
-        return self.transform(data_ndarray)
-
-    def fit(self, data_matrix, **fit_params):
-        self.n_samples = data_matrix.shape[0]
-        self.n_components = fit_params.get(N_COMPONENTS, 2)
-        self._standardized_data = self._standardize_data(data_matrix)
-        self._covariance_matrix = self.get_covariance_matrix()
-        self.components_ = self.get_eigenvectors()
-        return self
-
-    @staticmethod
-    def _standardize_data(matrix):
-        """
-        Subtract mean and divide by standard deviation column-wise.
-        Doing this proves to be very helpful when calculating the covariance matrix.
-        https://towardsdatascience.com/understanding-the-covariance-matrix-92076554ea44
-        Mean-Center the data
-        :param matrix: Data as matrix
-        :return: Standardized data matrix
-        """
-        numerator = matrix - np.mean(matrix, axis=0)
-        denominator = np.std(matrix, axis=0)
-        return numerator / denominator
-
-    def get_covariance_matrix(self):
-        """
-        Calculate covariance matrix with standardized matrix A
-        :return: Covariance Matrix
-        """
-        return np.cov(self._standardized_data.T)
-
-    def get_eigenvectors(self):
-        pass
-
-    def transform(self, data_matrix):
-        """
-        Project the data to the lower dimension with the help of the eigenvectors.
-        :return: Data reduced to lower dimensions from higher dimensions
-        """
-        data_matrix_standardized = self._standardize_data(data_matrix)
-        return np.dot(data_matrix_standardized, self.components_[:, :self.n_components])
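MyModel leaves get_eigenvectors unimplemented; a PCA-style child class would complete it by eigendecomposing the covariance matrix and sorting the components by explained variance, roughly as follows (a sketch under that assumption, not code from this repo):

    import numpy as np

    def get_eigenvectors(covariance_matrix):
        # eigh suits the symmetric covariance matrix; sort columns by
        # descending eigenvalue so the leading components come first.
        eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
        order = np.argsort(eigenvalues)[::-1]
        return eigenvectors[:, order]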
gedi/utils/algorithms/tsne.py
DELETED
@@ -1,69 +0,0 @@
-from scipy import spatial
-
-from tag.utils.algorithms import MyModel
-import sklearn.manifold as sk
-import numpy as np
-
-from tag.utils.matrix_tools import ensure_matrix_symmetry
-
-"""
-Parts of this file were originally copied from the tltsne python module.
-https://github.com/spiwokv/tltsne/blob/master/tltsne/__init__.py
-Since the results in the text file are complicated to reuse, this module was modified somewhat.
-This way, the results of the models can be used and it's Object Oriented.
-"""
-
-
-class MyTSNE(MyModel):
-    def __init__(self, n_components, perplexity=7.0,
-                 early_exaggeration=12.0, learning_rate="auto",
-                 n_iter=1000, metric="euclidean"):
-        super().__init__()
-        self.model = sk.TSNE(
-            n_components=n_components, perplexity=perplexity,
-            early_exaggeration=early_exaggeration, learning_rate=learning_rate,
-            n_iter=n_iter, metric=metric
-        )
-
-    def fit_transform(self, data_matrix, **fit_params):
-        return self.model.fit_transform(data_matrix, **fit_params)
-
-
-class MyTimeLaggedTSNE(MyTSNE):
-    def __init__(self, lag_time, **kwargs):
-        super().__init__(metric="precomputed", **kwargs)
-        self.lag_time = lag_time
-
-    def fit_transform(self, data_matrix, **fit_params):
-        data_zero_mean = data_matrix - np.mean(data_matrix, axis=0)
-        cov = np.cov(data_zero_mean.T)
-        eigenvalue, eigenvector = np.linalg.eig(cov)
-        eigenvalue_order = np.argsort(eigenvalue)[::-1]
-        eigenvector = eigenvector[:, eigenvalue_order]
-        eigenvalue = eigenvalue[eigenvalue_order]
-        projection = data_zero_mean.dot(eigenvector) / np.sqrt(eigenvalue)
-
-        n_frames = fit_params.get('n_frames', 0)
-        if self.lag_time <= 0:
-            covariance_matrix = np.dot(
-                projection[:, np.newaxis].T,
-                projection[:, np.newaxis]
-            ) / (n_frames - 1)
-        else:
-            covariance_matrix = np.dot(
-                projection[:-self.lag_time, np.newaxis].T,
-                projection[self.lag_time:, np.newaxis]
-            ) / (n_frames - self.lag_time - 1)
-        covariance_matrix = ensure_matrix_symmetry(covariance_matrix)
-
-        eigenvalue2, eigenvector2 = np.linalg.eig(covariance_matrix)
-        eigenvalue_order = np.argsort(eigenvalue2)[::-1]
-        eigenvector2 = eigenvector2[:, eigenvalue_order]
-        eigenvalue2 = eigenvalue2[eigenvalue_order]
-        projection = np.dot(
-            projection,
-            eigenvector2[:, :self.n_components]
-        ) * np.sqrt(np.real(eigenvalue2[:self.n_components]))
-        data_distance = spatial.distance_matrix(projection, projection)
-
-        return self.model.fit_transform(data_distance)
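MyTimeLaggedTSNE whitens the input, builds a time-lagged covariance, projects onto its leading eigenvectors, and finally hands a pairwise distance matrix to t-SNE. With metric="precomputed", scikit-learn's TSNE expects exactly such a square distance matrix instead of raw features; a minimal illustration of that contract (assumed data, not repo code):

    import numpy as np
    from scipy import spatial
    from sklearn.manifold import TSNE

    points = np.random.rand(30, 4)
    distances = spatial.distance_matrix(points, points)  # square, symmetric, zero diagonal

    # init="random" is required: PCA initialization is unavailable for precomputed distances.
    tsne = TSNE(n_components=2, metric="precomputed", init="random", perplexity=5)
    embedding = tsne.fit_transform(distances)            # operates on the distances directly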
main.py
CHANGED
@@ -3,7 +3,6 @@ import pandas as pd
 from datetime import datetime as dt
 from gedi.generator import GenerateEventLogs
 from gedi.features import EventLogFeatures
-from gedi.analyser import FeatureAnalyser
 from gedi.augmentation import InstanceAugmentator
 from gedi.benchmark import BenchmarkTest
 from gedi.plotter import BenchmarkPlotter, FeaturesPlotter, AugmentationPlotter, GenerationPlotter