leadingbridge committed on
Commit
1c1ad43
·
verified ·
1 Parent(s): d548c67

Create train_model.py

Browse files
Files changed (1) hide show
  1. train_model.py +39 -0
train_model.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Train a fixed-parameter RandomForest regressor and save it to model.pkl.

The script loads an Excel dataset from Hugging Face, selects the feature
columns, fits the model on an 80% training split, reports the R^2 score on
the remaining 20%, and writes the model together with its feature list to
``model.pkl`` in the current working directory.
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import joblib

# URL to the Excel dataset on Hugging Face
data_url = "https://huggingface.co/datasets/leadingbridge/flat/resolve/main/NorthPoint30.xlsx"

# Load dataset
df = pd.read_excel(data_url, engine="openpyxl")

# Drop columns that are not needed for prediction.
# errors='ignore' lets the script continue when some of them are absent.
cols_to_drop = ['Usage', 'Address', 'PricePerSquareFeet', 'InstrumentDate', 'Floor', 'Unit']
df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

# Rename useful columns for consistency.
# NOTE(review): presumably the workbook has duplicate "Floor"/"Unit" headers
# that pandas suffixes with ".1" on load -- confirm against the dataset.
df.rename(columns={"Floor.1": "Floor", "Unit.1": "Unit"}, inplace=True)

# Define features and target variable
feature_names = ['District', 'Longitude', 'Latitude', 'Floor', 'Unit', 'Area', 'Year', 'WeekNumber']
target_name = 'PriceInMillion'

# Fail early with a readable message instead of a bare pandas KeyError when
# the workbook layout does not match what the script expects.
missing_columns = [col for col in feature_names + [target_name] if col not in df.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {missing_columns}")

X = df[feature_names]
y = df[target_name]

# Train/test split (keep the held-out part so the model can be evaluated)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a fixed-parameter RandomForest (no grid search)
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=6,
    max_features='sqrt',
    random_state=42,
)
model.fit(X_train, y_train)

# Report the score on data the model has not seen; for a regressor,
# score() returns the coefficient of determination (R^2).
print(f"Held-out R^2: {model.score(X_test, y_test):.3f}")

# Save model and feature list
joblib.dump({"model": model, "features": feature_names}, "model.pkl")
print("✅ Model trained and saved to model.pkl")