File size: 5,141 Bytes
43b66f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import pandas as pd
import numpy as np

def get_dataset_info(df):
    """
    Get basic information about a dataset.

    Args:
        df: Pandas DataFrame

    Returns:
        Dictionary with dataset information:
            'rows', 'columns': shape of the frame
            'missing_values': total number of missing cells
            'duplicate_rows': number of duplicated rows
            'memory_usage': deep memory usage in MB
            'column_types': mapping of dtype name -> number of columns
            'column_info': list with one dict per column ('name', 'type',
                'missing', 'missing_pct', 'unique_values'; numeric columns
                add 'min', 'max', 'mean', 'median', 'std'; object/string
                columns add 'top_values' and 'likely_categorical')
    """
    n_rows = len(df)

    info = {
        'rows': df.shape[0],
        'columns': df.shape[1],
        'missing_values': df.isna().sum().sum(),
        'duplicate_rows': df.duplicated().sum(),
        'memory_usage': df.memory_usage(deep=True).sum() / (1024 * 1024),  # MB
        'column_types': df.dtypes.astype(str).value_counts().to_dict(),
        'column_info': []
    }

    # Get info for each column
    for col in df.columns:
        col_data = df[col]
        # Computed once and reused below (percentage and categorical check)
        missing = col_data.isna().sum()
        unique_values = col_data.nunique()

        col_info = {
            'name': col,
            'type': str(col_data.dtype),
            'missing': missing,
            # An empty frame has nothing missing; avoid dividing by zero
            'missing_pct': (missing / n_rows) * 100 if n_rows else 0.0,
            'unique_values': unique_values
        }

        # Add additional info for numeric columns
        if pd.api.types.is_numeric_dtype(col_data):
            col_info.update({
                'min': col_data.min(),
                'max': col_data.max(),
                'mean': col_data.mean(),
                'median': col_data.median(),
                'std': col_data.std()
            })

        # Add additional info for categorical/text columns. Dedicated string
        # dtypes are included as well as plain object columns.
        elif (pd.api.types.is_object_dtype(col_data)
              or pd.api.types.is_string_dtype(col_data)):
            # Get top values
            col_info['top_values'] = col_data.value_counts().head(5).to_dict()

            # Estimate if it's a categorical column: fewer than 10% of the
            # rows hold distinct values. With zero rows there is no evidence
            # either way, so report False instead of dividing by zero.
            col_info['likely_categorical'] = (
                n_rows > 0 and unique_values / n_rows < 0.1
            )

        info['column_info'].append(col_info)

    return info

def detect_dataset_format(df):
    """
    Try to detect the format/type of the dataset based on its structure.

    Checks are applied in order; the first match wins:
        1. "text"        - more than half of the columns are string columns
                           whose mean value length exceeds 100 characters
        2. "time_series" - at least one datetime column (naive or tz-aware)
        3. "mixed" / "numeric" / "categorical" - based on which of numeric
           and object/category columns are present
        4. "generic"     - none of the above (including a frame with no
                           columns)

    Args:
        df: Pandas DataFrame

    Returns:
        String indicating the likely format
    """
    n_cols = len(df.columns)
    if n_cols == 0:
        # Nothing to inspect; also avoids dividing by zero below
        return "generic"

    # Check for text data
    text_cols = sum(
        1
        for col in df.columns
        if pd.api.types.is_string_dtype(df[col]) and df[col].str.len().mean() > 100
    )
    if text_cols / n_cols > 0.5:
        return "text"

    # Check for time series data. The "any" variant also recognises
    # timezone-aware datetime columns, which is_datetime64_dtype rejects.
    if any(pd.api.types.is_datetime64_any_dtype(df[col]) for col in df.columns):
        return "time_series"

    # Check if it looks like tabular data
    numeric_cols = len(df.select_dtypes(include=[np.number]).columns)
    categorical_cols = len(df.select_dtypes(include=['object', 'category']).columns)

    if numeric_cols > 0 and categorical_cols > 0:
        return "mixed"
    elif numeric_cols > 0:
        return "numeric"
    elif categorical_cols > 0:
        return "categorical"

    # Default
    return "generic"

def check_column_completeness(df, threshold=0.8):
    """
    Report the columns whose share of non-missing values is below a threshold.

    Args:
        df: Pandas DataFrame
        threshold: Minimum acceptable completeness as a fraction
            (0.8 = at least 80% of the values must be present)

    Returns:
        List of dicts, one per column below the threshold, in column order.
        Each dict has the keys 'Column', 'Completeness', 'Missing' (both
        formatted as percentages) and 'Recommendation'.
    """
    total_rows = len(df)

    # Pair every column with the fraction of its values that are missing
    missing_fractions = (
        (name, df[name].isna().sum() / total_rows) for name in df.columns
    )

    return [
        {
            'Column': name,
            'Completeness': f"{1 - frac_missing:.2%}",
            'Missing': f"{frac_missing:.2%}",
            'Recommendation': 'Consider imputing or removing this column'
        }
        for name, frac_missing in missing_fractions
        if 1 - frac_missing < threshold
    ]

def detect_outliers(series, method='iqr', factor=1.5):
    """
    Detect outliers in a pandas Series using IQR or Z-score method.

    Args:
        series: Pandas Series with numeric values (missing values are
            ignored and never reported as outliers)
        method: 'iqr' for the interquartile-range rule; any other value
            (conventionally 'zscore') selects the Z-score rule
        factor: Multiplier for IQR or Z-score threshold

    Returns:
        Tuple of (outlier_indices, lower_bound, upper_bound) where
        outlier_indices is a list of index labels. For both methods the
        flagged values are the ones lying strictly outside the returned
        bounds.
    """
    if method == 'iqr':
        # IQR method
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr

        outliers = series[(series < lower_bound) | (series > upper_bound)].index.tolist()

    else:  # zscore
        # Z-score method, computed on the non-missing values only
        clean = series.dropna()
        mean = clean.mean()
        # Population standard deviation (ddof=0). The same value drives both
        # the detection and the reported bounds so the two always agree.
        std = clean.std(ddof=0)

        if std > 0:
            z_scores = (clean - mean) / std
            outliers = clean[z_scores.abs() > factor].index.tolist()
        else:
            # Zero or undefined spread (constant or empty data): no value
            # can meaningfully be called an outlier.
            outliers = []

        # Bounds equivalent to |z| > factor
        lower_bound = mean - factor * std
        upper_bound = mean + factor * std

    return outliers, lower_bound, upper_bound