|
from os.path import join, dirname
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
# Cities whose rows get their "country" column overwritten with NAMIBIA_CODE.
NAMIBIA_CITIES = [
    "Walvis Bay",
    "Keetmanshoop",
    "Warmbad",
    "Rundu",
    "Outapi",
    "Karibib",
    "Otjimbingwe",
    "Ondangwa",
    "Oranjemund",
    "Maltahohe",
    "Otavi",
    "Outjo",
    "Swakopmund",
    "Gobabis",
    "Karasburg",
    "Opuwo",
    "Hentiesbaai",
    "Katima Mulilo",
    "Oshikango",
    "Bethanie",
    "Ongandjera",
    "Mariental",
    "Bagani",
    "Nkurenkuru",
    "Usakos",
    "Rehoboth",
    "Aranos",
    "Omaruru",
    "Arandis",
    "Windhoek",
    "Khorixas",
    "Okahandja",
    "Grootfontein",
    "Tsumeb",
]

# Label written into "country" for every row whose city is in NAMIBIA_CITIES.
# NOTE(review): presumably needed because pandas reads the two-letter code
# "NA" as a missing value by default -- confirm against the raw CSV.
NAMIBIA_CODE = "NMB"

# Rows lacking any of these columns are dropped before the file is rewritten.
REQUIRED_COLUMNS = ["id", "latitude", "longitude"]

# Read the text columns as str instead of letting pandas infer their type.
CSV_DTYPE = {"category": str, "country": str, "city": str}


def fix_country_labels(df):
    """Relabel the listed cities, validate countries, drop incomplete rows.

    Steps, in order:
      1. every row whose "city" is in NAMIBIA_CITIES gets
         ``country = NAMIBIA_CODE`` (written into ``df`` in place);
      2. every remaining "country" value must be a ``str``;
      3. rows missing any of REQUIRED_COLUMNS are removed.

    Args:
        df: frame with at least the columns "city", "country", "id",
            "latitude" and "longitude". Its "country" column is modified
            in place by step 1.

    Returns:
        A frame without the rows dropped in step 3.

    Raises:
        ValueError: if any "country" value is not a string after step 1.
            This is an explicit raise rather than ``assert`` so the check
            still runs under ``python -O``.
    """
    in_listed_city = df["city"].isin(NAMIBIA_CITIES)
    df.loc[in_listed_city, "country"] = NAMIBIA_CODE

    # Validation happens before dropna, so it covers rows that would be
    # dropped afterwards as well.
    has_text_country = df["country"].map(
        lambda value: isinstance(value, str)
    ).astype(bool)
    if not has_text_country.all():
        offending = df.loc[~has_text_country, "city"].dropna().unique().tolist()
        raise ValueError(
            f"{int((~has_text_country).sum())} row(s) have a non-string "
            f"country; cities involved (first 20): {offending[:20]}"
        )

    return df.dropna(subset=REQUIRED_COLUMNS)


def main():
    """Rewrite the train and test CSV files in place with fixed labels."""
    # <parent of this file's directory>/datasets/osv5m/{train,test}.csv
    dataset_dir = join(dirname(dirname(__file__)), "datasets", "osv5m")
    for split in ["train", "test"]:
        fp = join(dataset_dir, f"{split}.csv")
        df = pd.read_csv(fp, dtype=CSV_DTYPE)
        # Nothing is written when validation raises, so a bad file is left
        # untouched.
        fix_country_labels(df).to_csv(fp, index=False)


if __name__ == "__main__":
    main()
|
|
|