import pandas as pd
import numpy as np

def get_dataset_info(df):
    """
    Get basic information about a dataset.

    Args:
        df: Pandas DataFrame

    Returns:
        Dictionary with dataset information
    """
    info = {
        'rows': df.shape[0],
        'columns': df.shape[1],
        'missing_values': df.isna().sum().sum(),
        'duplicate_rows': df.duplicated().sum(),
        'memory_usage': df.memory_usage(deep=True).sum() / (1024 * 1024),  # MB
        'column_types': df.dtypes.astype(str).value_counts().to_dict(),
        'column_info': []
    }

    # Get info for each column
    for col in df.columns:
        col_info = {
            'name': col,
            'type': str(df[col].dtype),
            'missing': df[col].isna().sum(),
            'missing_pct': (df[col].isna().sum() / len(df)) * 100,
            'unique_values': df[col].nunique()
        }

        # Add additional info for numeric columns
        if pd.api.types.is_numeric_dtype(df[col]):
            col_info.update({
                'min': df[col].min(),
                'max': df[col].max(),
                'mean': df[col].mean(),
                'median': df[col].median(),
                'std': df[col].std()
            })
        # Add additional info for categorical/text columns
        elif pd.api.types.is_object_dtype(df[col]):
            # Get the five most frequent values
            col_info['top_values'] = df[col].value_counts().head(5).to_dict()
            # Heuristic: treat the column as categorical if fewer than
            # 10% of rows hold a unique value
            col_info['likely_categorical'] = df[col].nunique() / len(df) < 0.1

        info['column_info'].append(col_info)

    return info
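
# Example usage for get_dataset_info (a sketch; `df` and the CSV path are
# placeholders, not part of this module):
#
#   df = pd.read_csv("data.csv")
#   info = get_dataset_info(df)
#   print(f"{info['rows']} rows x {info['columns']} columns, "
#         f"{info['missing_values']} missing values")
#   for col in info['column_info']:
#       print(col['name'], col['type'], f"{col['missing_pct']:.1f}% missing")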

def detect_dataset_format(df):
    """
    Try to detect the format/type of the dataset based on its structure.

    Args:
        df: Pandas DataFrame

    Returns:
        String indicating the likely format
    """
    # Check for long-form text data: string columns whose average length
    # exceeds 100 characters
    text_cols = 0
    for col in df.columns:
        if pd.api.types.is_string_dtype(df[col]) and df[col].str.len().mean() > 100:
            text_cols += 1
    if text_cols / len(df.columns) > 0.5:
        return "text"

    # Check for time series data: any datetime column
    date_cols = 0
    for col in df.columns:
        if pd.api.types.is_datetime64_dtype(df[col]):
            date_cols += 1
    if date_cols > 0:
        return "time_series"

    # Otherwise classify by the mix of column dtypes
    numeric_cols = len(df.select_dtypes(include=[np.number]).columns)
    categorical_cols = len(df.select_dtypes(include=['object', 'category']).columns)

    if numeric_cols > 0 and categorical_cols > 0:
        return "mixed"
    elif numeric_cols > 0:
        return "numeric"
    elif categorical_cols > 0:
        return "categorical"

    # Default
    return "generic"

def check_column_completeness(df, threshold=0.8):
    """
    Check if columns have good completeness (less than 20% missing values by default).

    Args:
        df: Pandas DataFrame
        threshold: Completeness threshold (0.8 = 80% complete)

    Returns:
        List of columns with poor completeness
    """
    results = []
    for col in df.columns:
        missing_ratio = df[col].isna().sum() / len(df)
        completeness = 1 - missing_ratio
        if completeness < threshold:
            results.append({
                'Column': col,
                'Completeness': f"{completeness:.2%}",
                'Missing': f"{missing_ratio:.2%}",
                'Recommendation': 'Consider imputing or removing this column'
            })
    return results
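
# Example usage for check_column_completeness (a sketch; the sample frame is
# illustrative only):
#
#   df = pd.DataFrame({'a': [1, None, None, None], 'b': [1, 2, 3, 4]})
#   check_column_completeness(df, threshold=0.8)
#   # -> [{'Column': 'a', 'Completeness': '25.00%', 'Missing': '75.00%',
#   #      'Recommendation': 'Consider imputing or removing this column'}]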

def detect_outliers(series, method='iqr', factor=1.5):
    """
    Detect outliers in a pandas Series using the IQR or Z-score method.

    Args:
        series: Pandas Series with numeric values
        method: 'iqr' or 'zscore'
        factor: Multiplier for the IQR, or the Z-score threshold

    Returns:
        Tuple of (outlier_indices, lower_bound, upper_bound)
    """
    if method == 'iqr':
        # IQR method: flag values beyond factor * IQR from the quartiles
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr
        outliers = series[(series < lower_bound) | (series > upper_bound)].index.tolist()
    else:  # zscore
        # Z-score method: flag values whose standardized score exceeds the threshold
        from scipy import stats

        non_null = series.dropna()
        abs_z_scores = np.abs(stats.zscore(non_null))
        outlier_positions = np.where(abs_z_scores > factor)[0]
        outliers = non_null.iloc[outlier_positions].index.tolist()
        # Compute equivalent bounds for consistency with the IQR branch
        mean = series.mean()
        std = series.std()
        lower_bound = mean - factor * std
        upper_bound = mean + factor * std

    return outliers, lower_bound, upper_bound
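
if __name__ == "__main__":
    # Minimal smoke test with illustrative data (not part of the module's
    # API; the Z-score branch additionally requires scipy).
    rng = np.random.default_rng(0)
    demo = pd.Series(np.append(rng.normal(0, 1, 100), [15.0]))

    # The injected value 15.0 lies far outside both sets of bounds,
    # so each method should flag it.
    iqr_idx, iqr_lo, iqr_hi = detect_outliers(demo, method='iqr')
    z_idx, z_lo, z_hi = detect_outliers(demo, method='zscore', factor=3)
    print(f"IQR flagged {len(iqr_idx)} point(s) outside [{iqr_lo:.2f}, {iqr_hi:.2f}]")
    print(f"Z-score flagged {len(z_idx)} point(s) outside [{z_lo:.2f}, {z_hi:.2f}]")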