Spaces:

datajoi
/

Dataset-Test-Workflow

Sleeping

App Files Files Community

Mustehson committed on Nov 12, 2024

Commit

4e0396a

verified ·

1 Parent(s): a6502a8

Delete utils.py

Browse files

Files changed (1) hide show

utils.py +0 -162

utils.py DELETED Viewed

@@ -1,162 +0,0 @@
-import numpy as np
-import pandas as pd
-# -----------------Numerical Statistics-----------------
def format_values(key, value):
    """Render a single statistic as a display string.

    Args:
        key: Label of the statistic. A label containing ``"Memory"`` selects
            byte-size formatting; a label containing ``"%"`` selects
            percentage formatting.
        value: The statistic. Anything that is not an ``int`` or ``float``
            (for example a timestamp) is returned through ``str()`` untouched.

    Returns:
        The formatted value as a string.
    """
    if not isinstance(value, (int, float)):
        # e.g. a time value: nothing numeric to format
        return str(value)

    if "Memory" in key:
        units = ("B", "KB", "MB", "GB", "TB")
        unit_idx = 0
        # Stop at the largest unit so very large sizes cannot index past the
        # table, and use >= so exactly 1024 rolls over to the next unit.
        while value >= 1024 and unit_idx < len(units) - 1:
            value /= 1024
            unit_idx += 1
        return f"{value:.1f} {units[unit_idx]}"

    if (value * 10) % 10 == 0:
        # Integral value, possibly stored as a float with a trailing ".0"
        value = int(value)
        if abs(value) >= 1000000:
            # Large integers are returned right away in 5-significant-digit
            # form; the percentage step below is not applied to them.
            return f"{value:.5g}"
    elif abs(value) >= 1000000 or abs(value) < 0.001:
        value = f"{value:.5g}"
    elif abs(value) >= 1:
        # Round to 4 decimals, then drop the fraction if it rounded away
        rounded = float(f"{value:.4f}")
        value = int(rounded) if (rounded * 10) % 10 == 0 else rounded
    elif 0.001 <= abs(value) < 1:
        value = f"{value:.4g}"
    else:
        # Only values that fail every comparison above (NaN) end up here
        value = str(value)

    if "%" in key:
        # Ratios are shown as a percentage with one decimal
        value = f"{float(value):.1%}"
    return str(value)
def format_num_stats(data):
    """
    Format numerical statistics.

    Builds overview, quantile and descriptive statistics from the
    pre-computed values in ``data`` and renders each of them with
    ``format_values``.

    Args:
        data: Mapping providing the keys read below: ``nuniq``, ``npres``,
            ``nrows``, ``nreals``, ``mem_use``, ``mean``, ``min``, ``max``,
            ``nzero``, ``nneg``, ``std``, ``skew``, ``kurt`` and ``qntls``.
            ``qntls`` is looked up with ``.loc`` at 0.05, 0.25, 0.50, 0.75
            and 0.95 (presumably a pandas Series keyed by quantile level --
            confirm against the caller).

    Returns:
        A dict with the single key ``"Overview"`` whose value maps every
        statistic label to its formatted string. Labels shared by several
        groups ("Minimum", "Maximum", "Mean") appear once.
    """
    def ratio(numerator, denominator):
        # A column with no rows / no present values has no meaningful ratio;
        # report NaN (as done for the coefficient of variation) instead of
        # dividing by zero.
        return numerator / denominator if denominator != 0 else np.nan

    nrows = data["nrows"]
    npres = data["npres"]
    ninf = npres - data["nreals"]

    overview = {
        "Approximate Distinct Count": data["nuniq"],
        "Approximate Unique (%)": ratio(data["nuniq"], npres),
        "Missing": nrows - npres,
        "Missing (%)": 1 - ratio(npres, nrows),
        "Infinite": ninf,
        "Infinite (%)": ratio(ninf, nrows),
        "Memory Size": data["mem_use"],
        "Mean": data["mean"],
        "Minimum": data["min"],
        "Maximum": data["max"],
        "Zeros": data["nzero"],
        "Zeros (%)": ratio(data["nzero"], nrows),
        "Negatives": data["nneg"],
        "Negatives (%)": ratio(data["nneg"], nrows),
    }

    # Round the quantile labels so the literal lookups below match; work on a
    # copy so the caller's object is not modified.
    qntls = data["qntls"].copy()
    qntls.index = np.round(qntls.index, 2)
    quantile = {
        "Minimum": data["min"],
        "5-th Percentile": qntls.loc[0.05],
        "Q1": qntls.loc[0.25],
        "Median": qntls.loc[0.50],
        "Q3": qntls.loc[0.75],
        "95-th Percentile": qntls.loc[0.95],
        "Maximum": data["max"],
        "Range": data["max"] - data["min"],
        "IQR": qntls.loc[0.75] - qntls.loc[0.25],
    }

    descriptive = {
        "Mean": data["mean"],
        "Standard Deviation": data["std"],
        "Variance": data["std"] ** 2,
        "Sum": data["mean"] * npres,
        "Skewness": float(data["skew"]),
        "Kurtosis": float(data["kurt"]),
        "Coefficient of Variation": ratio(data["std"], data["mean"]),
    }

    # All three groups are flattened into one "Overview" section.
    return {
        "Overview": {
            **{k: format_values(k, v) for k, v in overview.items()},
            **{k: format_values(k, v) for k, v in quantile.items()},
            **{k: format_values(k, v) for k, v in descriptive.items()},
        }
    }
-# -----------------------------------------------------
-# -----------------Categorical Statistics-----------------
def format_cat_stats(data):
    """
    Format categorical statistics.

    Args:
        data: Mapping with at least the keys ``"stats"`` and ``"len_stats"``.
            ``data["stats"]`` must provide ``nuniq``, ``npres``, ``nrows`` and
            ``mem_use``; ``data["len_stats"]`` is a mapping of label to value
            whose entries are formatted as they are.

    Returns:
        A dict with the single key ``"Overview"`` mapping every overview and
        length statistic label to its ``format_values`` string.
    """
    stats = data["stats"]
    len_stats = data["len_stats"]
    nrows = stats["nrows"]
    npres = stats["npres"]
    # A column with no rows / no present values has no meaningful ratio;
    # report NaN instead of dividing by zero.
    nan = float("nan")

    ov_stats = {
        "Approximate Distinct Count": stats["nuniq"],
        "Approximate Unique (%)": stats["nuniq"] / npres if npres != 0 else nan,
        "Missing": nrows - npres,
        "Missing (%)": 1 - npres / nrows if nrows != 0 else nan,
        "Memory Size": stats["mem_use"],
    }

    # Overview and length statistics are flattened into one section.
    return {
        "Overview": {
            **{k: format_values(k, v) for k, v in ov_stats.items()},
            **{k: format_values(k, v) for k, v in len_stats.items()},
        }
    }
-# -----------------------------------------------------
def format_ov_stats(stats):
    """Format dataset-level overview statistics.

    Args:
        stats: Mapping with exactly six values which are unpacked by
            position, so their order matters: row count, column count,
            number of present cells, number of rows without duplicates,
            memory use and the dtype counts.

    Returns:
        A tuple ``(formatted, dtypes_cnt)`` where ``formatted`` maps each
        overview label to its ``format_values`` string and ``dtypes_cnt`` is
        the last value of ``stats``, passed through unchanged.
    """
    # Positional unpack: relies on the order of the values in ``stats``.
    nrows, ncols, npresent_cells, nrows_wo_dups, mem_use, dtypes_cnt = stats.values()
    ncells = nrows * ncols
    # An empty dataset has no meaningful ratios; report NaN instead of
    # dividing by zero.
    nan = float("nan")

    data = {
        "Number of Variables": ncols,
        "Number of Rows": nrows,
        "Missing Cells": float(ncells - npresent_cells),
        "Missing Cells (%)": 1 - (npresent_cells / ncells) if ncells != 0 else nan,
        "Duplicate Rows": nrows - nrows_wo_dups,
        "Duplicate Rows (%)": 1 - (nrows_wo_dups / nrows) if nrows != 0 else nan,
        "Total Size in Memory": float(mem_use),
        "Average Row Size in Memory": mem_use / nrows if nrows != 0 else nan,
    }
    return {k: format_values(k, v) for k, v in data.items()}, dtypes_cnt
def format_insights(data):
    """Flatten insight records into a two-column DataFrame.

    Args:
        data: Mapping whose values are lists of dicts; every
            ``category -> description`` pair inside those dicts becomes one
            row. The mapping's keys are not used.

    Returns:
        A DataFrame with the columns ``Category`` and ``Description``. The
        ``/*start*/`` and ``/*end*/`` markers are removed from the
        descriptions. When there are no insights an empty DataFrame with the
        same two columns is returned.
    """
    columns = ['Category', 'Description']
    data_list = [
        {'Category': category, 'Description': description}
        for value_list in data.values()
        for item in value_list
        for category, description in item.items()
    ]
    if not data_list:
        # Without rows the frame would have no 'Description' column and the
        # clean-up below would raise KeyError.
        return pd.DataFrame(columns=columns)

    insights_df = pd.DataFrame(data_list, columns=columns)
    # The markers are fixed text, so plain (non-regex) replacement is enough.
    insights_df['Description'] = insights_df['Description'].str.replace('/*start*/', '', regex=False)
    insights_df['Description'] = insights_df['Description'].str.replace('/*end*/', '', regex=False)
    return insights_df