Spaces:
Running
Running
Create _helper.py
Browse files- utils/_helper.py +55 -0
utils/_helper.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Contains helper functions and variables."""
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
# Lookup from two-letter state/territory code to the region label used in this example.
# Built by flattening (region, codes) pairs so that each region label is spelled once.
REGION_MAPPING = {
    code: region
    for region, codes in (
        ("North East", ("CT", "ME", "MA", "NH", "RI", "VT", "NJ", "NY", "PA")),
        ("Mid West", ("IL", "IN", "MI", "OH", "WI", "IA", "KS", "MN", "MO", "NE", "ND", "SD")),  # codespell:ignore
        (
            "South",
            ("DE", "FL", "GA", "MD", "NC", "SC", "VA", "WV", "DC", "AL", "KY", "MS", "TN", "AR", "LA"),
        ),
        ("South West", ("AZ", "NM", "OK", "TX")),
        ("West", ("CO", "ID", "MT", "NV", "UT", "WY", "AK", "CA", "HI", "OR", "WA")),
        # Codes that are not assigned to any of the regions above.
        ("Other", ("UM", "PR", "AP", "VI", "AE", "AS", "GU", "FM", "PW", "MP")),
    )
    for code in codes
}
|
18 |
+
|
19 |
+
|
20 |
+
def fill_na_with_random(df, column):
    """Fill missing values in a column with random values from the same column.

    Replacement values are drawn with replacement from the column's non-missing
    entries using ``np.random.choice``, so frequent values are proportionally
    more likely to be picked. Call ``np.random.seed`` beforehand if reproducible
    output is needed.

    Args:
        df: DataFrame holding the column. It is modified in place.
        column: Label of the column to fill.

    Returns:
        The filled column, i.e. ``df[column]`` after the update. If the column
        has no missing values, or has no non-missing values to sample from, it
        is returned unchanged.
    """
    missing = df[column].isna()
    candidates = df[column].dropna().to_numpy()

    # Nothing to fill, or nothing to sample from: np.random.choice raises on an
    # empty pool, so leave the column as it is instead of crashing.
    if not missing.any() or candidates.size == 0:
        return df[column]

    # One vectorised draw for all missing cells instead of one RNG call per row.
    df.loc[missing, column] = np.random.choice(candidates, size=int(missing.sum()))
    return df[column]
|
25 |
+
|
26 |
+
|
27 |
+
def clean_data_and_add_columns(data: pd.DataFrame) -> pd.DataFrame:
    """Tidy the original data set, add new columns, and change cell values for this example.

    Args:
        data: Raw frame. The code reads the columns "Date Sumbited" (spelled that
            way in the input), "Submitted via", "Company response to consumer",
            "State" and "Date Received". Both date columns are parsed with the
            ``%m/%d/%y`` format.

    Returns:
        The frame produced by ``rename`` with cleaned values and the added columns
        "Year-Month Received", "Region", "Company response" and
        "Company response - Closed". States that are not keys of
        ``REGION_MAPPING`` get a missing "Region".
    """
    data = data.rename(
        columns={
            "Date Sumbited": "Date Submitted",
            "Submitted via": "Channel",
            "Company response to consumer": "Company response - detailed",
        },
    )
    detailed = "Company response - detailed"

    # Clean cell values and/or assign different values for the purpose of this example
    data[detailed] = data[detailed].replace("Closed", "Closed without relief")
    data["State"] = data["State"].replace("UNITED STATES MINOR OUTLYING ISLANDS", "UM")
    data["State"] = fill_na_with_random(data, "State")

    # Parse each date column once and reuse the parsed values for every derived string
    received = pd.to_datetime(data["Date Received"], format="%m/%d/%y")
    submitted = pd.to_datetime(data["Date Submitted"], format="%m/%d/%y")
    data["Date Received"] = received.dt.strftime("%Y-%m-%d")
    data["Date Submitted"] = submitted.dt.strftime("%Y-%m-%d")

    # Create additional columns
    data["Year-Month Received"] = received.dt.strftime("%Y-%m")
    data["Region"] = data["State"].map(REGION_MAPPING)

    # na=False: a missing response must not count as "Closed". Without it the mask
    # holds NaN for missing cells, which np.where treats as truthy.
    is_closed = data[detailed].str.contains("Closed", na=False)
    data["Company response"] = np.where(is_closed, "Closed", data[detailed])
    data["Company response - Closed"] = np.where(is_closed, data[detailed], "Not closed")
    return data
|