sepsis-streamlit / utils /janitor.py
gabcares's picture
Streamlit client source code
bfc7b8a verified
raw
history blame
1.77 kB
import pandas as pd
import re
class Janitor:
    """Cleaning pipeline for a tabular patient dataset held in a DataFrame.

    ``clean_dataframe`` is the main entry point; the individual steps are
    public methods so they can also be applied on their own.
    """

    # Matches the zero-width boundaries inside CamelCase / PascalCase names
    # where an underscore has to be inserted (never at the very start).
    _CAMEL_BOUNDARY = re.compile(
        r'(?<!^)(?=[A-Z][a-z])|(?<=[a-z])(?=[A-Z])')

    # Strings that, once lower-cased and stripped, count as "missing".
    _NONE_LIKE = frozenset({'none', ''})

    # Columns (already snake_cased) cast by ``fix_datatypes``.
    _INT_COLUMNS = ('prg', 'pl', 'pr', 'sk', 'ts', 'age')
    _FLOAT_COLUMNS = ('m11', 'bd2')

    # Column whose missing rows are dropped by ``dropna_target``.
    _TARGET_COLUMN = 'sepsis'

    def __init__(self):
        """Create a janitor; it holds no per-instance state."""
        pass

    def clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Run every cleaning step in sequence and return the cleaned frame.

        The input is copied first, so the caller's DataFrame is left
        untouched. The returned frame has a fresh 0..n-1 index.
        """
        df = df.copy()  # Preserve the integrity of the caller's frame
        df = self.drop_duplicates(df)
        df = self.snake_case_columns(df)
        df = self.fix_none(df)
        df = self.fix_datatypes(df)
        df = self.dropna_target(df)
        return df.reset_index(drop=True)  # Fix index after dropping rows

    def drop_duplicates(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` without fully duplicated rows (first one is kept).

        When there is nothing to drop the very same object is returned.
        """
        # Compute the mask once instead of once to test and once to drop.
        duplicated_rows = df.duplicated()
        if duplicated_rows.any():
            return df[~duplicated_rows]
        return df

    def snake_case_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Rename the columns of ``df`` to snake_case and return ``df``.

        The rename is applied to the frame that was passed in (in place).
        Non-string column labels are left as they are, because the regex
        substitution only makes sense for text.
        """
        boundary = self._CAMEL_BOUNDARY
        df.columns = [
            boundary.sub('_', column).lower()
            if isinstance(column, str) else column
            for column in df.columns
        ]
        return df

    def fix_none(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a new frame in which every missing-like cell is ``pd.NA``.

        A cell is missing-like when ``pd.isnull`` says so, or when it is a
        string equal to ``'none'`` or ``''`` after lower-casing and
        stripping surrounding whitespace.
        """
        none_like = self._NONE_LIKE

        def replace_none(value):
            if pd.isnull(value):
                return pd.NA
            if isinstance(value, str) and value.lower().strip() in none_like:
                return pd.NA
            return value

        # ``DataFrame.map`` is the pandas >= 2.1 name of ``applymap``; check
        # the class (not the instance) so a column called "map" cannot be
        # mistaken for the method on older versions.
        if hasattr(pd.DataFrame, 'map'):
            return df.map(replace_none)
        return df.applymap(replace_none)

    def fix_datatypes(self, df: pd.DataFrame) -> pd.DataFrame:
        """Cast the known numeric columns of ``df`` in place and return it.

        Nothing is changed unless *all* expected columns are present.
        Integer columns become plain ``int`` when complete and nullable
        ``Int64`` when they contain missing values; float columns become
        ``float`` with ``NaN`` for missing values.
        """
        expected = set(self._INT_COLUMNS) | set(self._FLOAT_COLUMNS)
        if not expected.issubset(df.columns):
            return df

        for name in self._INT_COLUMNS:
            df[name] = self._to_int(df[name])
        for name in self._FLOAT_COLUMNS:
            df[name] = self._to_float(df[name])
        return df

    def dropna_target(self, df: pd.DataFrame) -> pd.DataFrame:
        """Drop rows whose target column is missing.

        If the target column does not exist, ``df`` is returned unchanged.
        The index is not reset here; ``clean_dataframe`` does that.
        """
        if self._TARGET_COLUMN not in df.columns:
            return df
        # A list works on every pandas version; a bare label does not.
        return df.dropna(subset=[self._TARGET_COLUMN])

    @staticmethod
    def _to_int(column: pd.Series) -> pd.Series:
        """Cast ``column`` to integers, keeping missing values as ``pd.NA``."""
        present = column.notna()
        if present.all():
            return column.astype(int)
        # ``astype(int)`` cannot digest pd.NA: cast with a placeholder in the
        # missing slots, then blank those slots again in a nullable dtype.
        placeholder = column.astype(object).where(present, 0)
        return placeholder.astype(int).astype('Int64').mask(~present, pd.NA)

    @staticmethod
    def _to_float(column: pd.Series) -> pd.Series:
        """Cast ``column`` to floats, turning missing values into ``NaN``."""
        present = column.notna()
        if present.all():
            return column.astype(float)
        # pd.NA cannot be passed to float(); swap it for NaN before casting.
        return column.astype(object).where(present, float('nan')).astype(float)