maxschulz-COL committed on
Commit
993dfd7
·
verified ·
1 Parent(s): 398985a

Create _helper.py

Browse files
Files changed (1) hide show
  1. utils/_helper.py +55 -0
utils/_helper.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Contains helper functions and variables."""
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+
6
# Lookup from two-letter state/territory code to the region label used in this example.
# Built by flattening (region, codes) pairs; codes absent from every group have no entry.
REGION_MAPPING = {
    code: region
    for region, codes in (
        ("North East", ("CT", "ME", "MA", "NH", "RI", "VT", "NJ", "NY", "PA")),
        ("Mid West", ("IL", "IN", "MI", "OH", "WI", "IA", "KS", "MN", "MO", "NE", "ND", "SD")),  # codespell:ignore
        ("South", ("DE", "FL", "GA", "MD", "NC", "SC", "VA", "WV", "DC", "AL", "KY", "MS", "TN", "AR", "LA")),
        ("South West", ("AZ", "NM", "OK", "TX")),
        ("West", ("CO", "ID", "MT", "NV", "UT", "WY", "AK", "CA", "HI", "OR", "WA")),
        ("Other", ("UM", "PR", "AP", "VI", "AE", "AS", "GU", "FM", "PW", "MP")),
    )
    for code in codes
}
18
+
19
+
20
def fill_na_with_random(df, column):
    """Fill missing values in a column with random values from the same column.

    Every missing cell receives its own independent draw (via ``np.random.choice``)
    from the column's non-missing values. Non-missing cells are left untouched.

    Args:
        df: DataFrame that holds the column. ``df[column]`` is overwritten in place.
        column: Label of the column to fill.

    Returns:
        ``df[column]`` after filling. If the column has no non-missing values there
        is nothing to sample from, so the column is returned unchanged.
    """
    non_na_values = df[column].dropna().values

    # np.random.choice raises ValueError on an empty pool; leave such a column as it is.
    if len(non_na_values) == 0:
        return df[column]

    df[column] = df[column].apply(lambda x: np.random.choice(non_na_values) if pd.isna(x) else x)
    return df[column]
25
+
26
+
27
def clean_data_and_add_columns(data: pd.DataFrame) -> pd.DataFrame:
    """Tidy the original data set, add new columns, and change cell values for the purpose of this example.

    Steps, in order:
      1. Rename three columns ("Date Sumbited", "Submitted via", "Company response to consumer").
      2. Replace selected cell values and fill missing "State" values with random existing ones.
      3. Re-format "Date Received" and "Date Submitted" from ``%m/%d/%y`` to ``%Y-%m-%d`` strings.
      4. Add "Year-Month Received", "Region", "Company response" and "Company response - Closed".

    Args:
        data: Raw data set. It must contain the columns "Date Sumbited", "Date Received",
            "Company response to consumer" and "State".

    Returns:
        The DataFrame produced by ``data.rename`` with all the changes above applied.
    """
    data = data.rename(
        columns={
            # NOTE(review): "Sumbited" presumably mirrors a typo in the raw header - do not "fix" the key.
            "Date Sumbited": "Date Submitted",
            "Submitted via": "Channel",
            "Company response to consumer": "Company response - detailed",
        },
    )

    # Clean cell values and/or assign different values for the purpose of this example
    data["Company response - detailed"] = data["Company response - detailed"].replace("Closed", "Closed without relief")
    data["State"] = data["State"].replace("UNITED STATES MINOR OUTLYING ISLANDS", "UM")
    data["State"] = fill_na_with_random(data, "State")

    # Convert to correct data type; "Date Received" is parsed once and reused for the year-month column below
    date_received = pd.to_datetime(data["Date Received"], format="%m/%d/%y")
    data["Date Received"] = date_received.dt.strftime("%Y-%m-%d")
    data["Date Submitted"] = pd.to_datetime(data["Date Submitted"], format="%m/%d/%y").dt.strftime("%Y-%m-%d")

    # Create additional columns
    data["Year-Month Received"] = date_received.dt.strftime("%Y-%m")
    data["Region"] = data["State"].map(REGION_MAPPING)

    # na=False: a missing response must not count as "Closed". Without it the mask holds NaN, which
    # np.where treats as truthy on object dtype.
    is_closed = data["Company response - detailed"].str.contains("Closed", na=False)
    data["Company response"] = np.where(is_closed, "Closed", data["Company response - detailed"])
    data["Company response - Closed"] = np.where(is_closed, data["Company response - detailed"], "Not closed")
    return data