|
import os
|
|
from os.path import dirname, join
|
|
|
|
import pandas as pd
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
if __name__ == "__main__":
    # One-off script: carve a validation set out of datasets/osv5m/train.csv.
    # 10% of the rows (stratified by the "category" column) are written to
    # val.csv and train.csv is rewritten in place with the remaining 90%.
    data_path = join(dirname(dirname(__file__)), "datasets", "osv5m")
    train_fp = join(data_path, "train.csv")
    val_fp = join(data_path, "val.csv")

    # train.csv is overwritten below, so running this script a second time
    # would split the already-reduced training set again and replace the
    # previous validation rows (losing them for good). Refuse to do that.
    if os.path.exists(val_fp):
        raise SystemExit(
            f"{val_fp} already exists; train.csv appears to have been split "
            "already. Restore the original train.csv and delete val.csv to "
            "re-run the split."
        )

    # Make sure the output directory exists before anything is written.
    os.makedirs(dirname(val_fp), exist_ok=True)

    # Read these columns as strings rather than letting pandas infer a dtype.
    df = pd.read_csv(train_fp, dtype={"category": str, "country": str, "city": str})

    # Stratify on "category" so both subsets keep the same category
    # proportions; the fixed seed makes the split reproducible across runs.
    df_train, df_val = train_test_split(
        df,
        stratify=df["category"],
        test_size=0.1,
        random_state=42,
    )

    df_train.to_csv(train_fp, index=False)
    df_val.to_csv(val_fp, index=False)
|
|
|