|
import pandas as pd |
|
|
|
COLUMN_MAP = { |
|
"tpep_pickup_datetime": "pickup", |
|
"tpep_dropoff_datetime": "dropoff", |
|
"passenger_count": "passengers", |
|
"trip_distance": "distance", |
|
"fare_amount": "fare", |
|
"tip_amount": "tip", |
|
"tolls_amount": "tolls", |
|
"total_amount": "total", |
|
"color": "color", |
|
} |
|
|
|
PAYMENT_TYPES = { |
|
1: "credit card", |
|
2: "cash", |
|
} |
|
|
|
MAX_TRIP_DURATION = 8000 |
|
|
|
if __name__ == "__main__": |
|
|
|
raw = pd.read_csv( |
|
"raw/taxis.csv", |
|
parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"] |
|
).rename(columns=str.lower) |
|
|
|
loc = pd.read_csv("raw/taxi_zones.csv").set_index("LocationID").drop_duplicates() |
|
|
|
clean = ( |
|
raw[list(COLUMN_MAP)] |
|
.rename(columns=COLUMN_MAP) |
|
.assign(payment=raw["payment_type"].map(PAYMENT_TYPES)) |
|
.assign(pickup_zone=raw["pulocationid"].map(loc["zone"])) |
|
.assign(dropoff_zone=raw["dolocationid"].map(loc["zone"])) |
|
.assign(pickup_borough=raw["pulocationid"].map(loc["borough"])) |
|
.assign(dropoff_borough=raw["dolocationid"].map(loc["borough"])) |
|
.loc[lambda x: x["dropoff_borough"] != "EWR"] |
|
.loc[lambda x: x.eval("dropoff - pickup").dt.seconds < MAX_TRIP_DURATION] |
|
.loc[lambda x: (x["fare"] > 0) & (x["fare"] < 200)] |
|
.loc[lambda x: (x["tip"] / x["fare"]) < 1] |
|
) |
|
|
|
clean.to_csv("taxis.csv", index=False) |