import pandas as pd import numpy as np def get_entire_subset(): """ Returns the entire subset, which is an empty dictionary :return: empty dictionary """ return {} def get_random_subset(coordinates: pd.DataFrame, prob: float, min_elements: int = 0): """ Returns a random subset :param coordinates: data frame containing having as columns the features :param prob: probability to select a value of a feature :param min_elements: minimum number of elements to be included in the randomly generated sub-population :return: dictionary representing a random sub-population """ subset_random_values = {} shuffled_column_names = np.random.permutation(coordinates.columns.values) # consider each column once, in random order for column_name in shuffled_column_names: # get unique values of the current column temp = coordinates[column_name].unique() # include each attribute value with probability = prob mask_values = np.random.rand(len(temp)) < prob if mask_values.sum() < len(temp): # set values for the current column subset_random_values[column_name] = temp[mask_values].tolist() # compute the remaining records mask_subset = coordinates[subset_random_values.keys()].isin(subset_random_values).all(axis=1) remaining_records = len(coordinates.loc[mask_subset]) # only filter on this attribute if at least min_elements records would be kept if remaining_records < min_elements: del subset_random_values[column_name] return subset_random_values