Spaces:

whackthejacker
/

DataHubHub

Running

File size: 5,141 Bytes

43b66f1

import pandas as pd
import numpy as np

def get_dataset_info(df):
    """
    Get basic information about a dataset.

    Args:
        df: Pandas DataFrame

    Returns:
        Dictionary with dataset information:
            'rows', 'columns': shape of the frame
            'missing_values': total number of missing cells
            'duplicate_rows': number of rows flagged by ``df.duplicated()``
            'memory_usage': deep memory usage in MB
            'column_types': mapping of dtype name -> number of columns
            'column_info': list with one dict per column holding 'name',
                'type', 'missing', 'missing_pct' and 'unique_values';
                numeric columns add 'min', 'max', 'mean', 'median', 'std';
                object columns add 'top_values' (5 most frequent values)
                and 'likely_categorical'.

        For a DataFrame with zero rows, 'missing_pct' is 0.0 and
        'likely_categorical' is False instead of dividing by zero.
    """
    n_rows = len(df)

    info = {
        'rows': df.shape[0],
        'columns': df.shape[1],
        'missing_values': df.isna().sum().sum(),
        'duplicate_rows': df.duplicated().sum(),
        'memory_usage': df.memory_usage(deep=True).sum() / (1024 * 1024),  # MB
        'column_types': df.dtypes.astype(str).value_counts().to_dict(),
        'column_info': []
    }

    # Get info for each column
    for col in df.columns:
        series = df[col]
        # Computed once and reused below instead of re-scanning the column.
        missing = series.isna().sum()
        unique_values = series.nunique()

        col_info = {
            'name': col,
            'type': str(series.dtype),
            'missing': missing,
            # An empty frame has nothing missing; avoid 0 / 0.
            'missing_pct': (missing / n_rows) * 100 if n_rows else 0.0,
            'unique_values': unique_values
        }

        # Add additional info for numeric columns
        if pd.api.types.is_numeric_dtype(series):
            col_info.update({
                'min': series.min(),
                'max': series.max(),
                'mean': series.mean(),
                'median': series.median(),
                'std': series.std()
            })

        # Add additional info for object (typically text) columns
        elif pd.api.types.is_object_dtype(series):
            # Get top values
            col_info['top_values'] = series.value_counts().head(5).to_dict()

            # Estimate if it's a categorical column: fewer than 10% of the
            # rows carry a distinct value. nunique() and len() are plain
            # Python ints, so the zero-row case must be guarded explicitly
            # or the division raises ZeroDivisionError.
            col_info['likely_categorical'] = bool(
                n_rows > 0 and unique_values / n_rows < 0.1
            )

        info['column_info'].append(col_info)

    return info

def detect_dataset_format(df):
    """
    Try to detect the format/type of the dataset based on its structure.

    The checks run in order and the first match wins:
        1. "text"        - more than half of the columns are string columns
                           whose average value length exceeds 100 characters
        2. "time_series" - at least one datetime column (tz-naive or tz-aware)
        3. "mixed"       - both numeric and object/category columns
        4. "numeric"     - only numeric columns
        5. "categorical" - only object/category columns
        6. "generic"     - anything else, including a frame with no columns

    Args:
        df: Pandas DataFrame

    Returns:
        String indicating the likely format
    """
    n_cols = len(df.columns)
    if n_cols == 0:
        # Nothing to inspect; also avoids dividing by zero below.
        return "generic"

    # Check for text data. The dtype test must come first so that the
    # .str accessor is only touched on string columns.
    text_cols = 0
    for col in df.columns:
        if pd.api.types.is_string_dtype(df[col]) and df[col].str.len().mean() > 100:
            text_cols += 1

    if text_cols / n_cols > 0.5:
        return "text"

    # Check for time series data. is_datetime64_any_dtype also matches
    # timezone-aware columns, which is_datetime64_dtype does not.
    if any(pd.api.types.is_datetime64_any_dtype(df[col]) for col in df.columns):
        return "time_series"

    # Check if it looks like tabular data
    numeric_cols = len(df.select_dtypes(include=[np.number]).columns)
    categorical_cols = len(df.select_dtypes(include=['object', 'category']).columns)

    if numeric_cols > 0 and categorical_cols > 0:
        return "mixed"
    elif numeric_cols > 0:
        return "numeric"
    elif categorical_cols > 0:
        return "categorical"

    # Default
    return "generic"

def check_column_completeness(df, threshold=0.8):
    """
    Report the columns whose share of non-missing values is below a threshold.

    Args:
        df: Pandas DataFrame
        threshold: Minimum acceptable completeness as a fraction
            (the default 0.8 flags columns that are less than 80% complete)

    Returns:
        List of dicts, one per flagged column in column order, with the keys
        'Column', 'Completeness', 'Missing' and 'Recommendation'
    """
    total_rows = len(df)

    # Lazily pair every column name with its fraction of missing entries.
    missing_fractions = (
        (name, df[name].isna().sum() / total_rows) for name in df.columns
    )

    # Keep the comparison as "completeness < threshold": an undefined (NaN)
    # ratio compares False and is therefore never reported.
    return [
        {
            'Column': name,
            'Completeness': f"{1 - frac_missing:.2%}",
            'Missing': f"{frac_missing:.2%}",
            'Recommendation': 'Consider imputing or removing this column'
        }
        for name, frac_missing in missing_fractions
        if 1 - frac_missing < threshold
    ]

def detect_outliers(series, method='iqr', factor=1.5):
    """
    Detect outliers in a pandas Series using IQR or Z-score method.

    Both methods derive a lower and an upper bound and flag every value that
    lies strictly outside them, so the returned outliers always agree with
    the returned bounds. Missing values are ignored when computing the
    bounds and are never flagged.

    Args:
        series: Pandas Series with numeric values
        method: 'iqr' or 'zscore' (any value other than 'iqr' selects the
            Z-score method)
        factor: Multiplier for IQR or Z-score threshold

    Returns:
        Tuple of (outlier_indices, lower_bound, upper_bound), where
        outlier_indices is a list of index labels from ``series``
    """
    if method == 'iqr':
        # IQR method: fences at factor * IQR beyond the quartiles
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr

    else:  # zscore
        # Z-score method: |z| > factor is the same as lying more than
        # factor standard deviations from the mean. The population standard
        # deviation (ddof=0) is used for the bounds so that they match the
        # z-score definition; mixing it with the sample standard deviation
        # (ddof=1) would let a flagged point fall inside the reported bounds.
        mean = series.mean()
        std = series.std(ddof=0)

        lower_bound = mean - factor * std
        upper_bound = mean + factor * std

    # Comparisons against NaN are False, so missing values (and undefined
    # bounds from an empty or all-NaN series) never produce outliers.
    outside = (series < lower_bound) | (series > upper_bound)
    outliers = series[outside].index.tolist()

    return outliers, lower_bound, upper_bound