Arafath10 committed on
Commit
b9792ed
·
verified ·
1 Parent(s): 0a66441

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +81 -72
main.py CHANGED
@@ -24,51 +24,25 @@ app.add_middleware(
24
  allow_headers=["*"],
25
  )
26
 
27
-
28
-
29
- def train_the_model(data):
30
- try:
31
- new_data = data
32
- encoders = load('encoders.joblib')
33
- xgb_model = load('xgb_model.joblib')
34
- selected_columns = ['customer_name', 'customer_address', 'customer_phone',
35
- 'customer_email', 'cod', 'weight', 'origin_city.name',
36
- 'destination_city.name', 'status.name']
37
- new_data_filled = new_data[selected_columns].fillna('Missing')
38
- for col, encoder in encoders.items():
39
- if col in new_data_filled.columns:
40
- unseen_categories = set(new_data_filled[col]) - set(encoder.classes_)
41
- if unseen_categories:
42
- for category in unseen_categories:
43
- encoder.classes_ = np.append(encoder.classes_, category)
44
- new_data_filled[col] = encoder.transform(new_data_filled[col])
45
- else:
46
- new_data_filled[col] = encoder.transform(new_data_filled[col])
47
- X_new = new_data_filled.drop('status.name', axis=1)
48
- y_new = new_data_filled['status.name']
49
-
50
- X_train, X_test, y_train, y_test = train_test_split(X_new,y_new, test_size=0.2, random_state=42)
51
-
52
- xgb_model.fit(X_new, y_new)
53
- dump(xgb_model,'xgb_model.joblib')
54
 
55
 
56
- y_pred = xgb_model.predict(X_test)
57
- accuracy = accuracy_score(y_test, y_pred)
58
- classification_rep = classification_report(y_test, y_pred)
59
- return accuracy,classification_rep,"Model finetuned with new data."
60
 
 
 
 
 
 
61
 
62
- except:
63
- data = data
64
-
65
  # Select columns
66
  selected_columns = ['customer_name', 'customer_address', 'customer_phone',
67
- 'customer_email', 'cod', 'weight',
68
- 'origin_city.name', 'destination_city.name', 'status.name']
69
 
70
  # Handling missing values
71
- data_filled = data[selected_columns].fillna('Missing')
 
72
 
73
  # Encoding categorical variables
74
  encoders = {col: LabelEncoder() for col in selected_columns if data_filled[col].dtype == 'object'}
@@ -80,56 +54,56 @@ def train_the_model(data):
80
  y = data_filled['status.name']
81
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
82
 
83
- # Setup the hyperparameter grid to search
84
- param_grid = {
85
- 'max_depth': [3, 4, 5],
86
- 'learning_rate': [0.01, 0.1, 0.4],
87
- 'n_estimators': [100, 200, 300],
88
- 'subsample': [0.8, 0.9, 1],
89
- 'colsample_bytree': [0.3, 0.7]
 
 
 
 
 
 
 
 
 
 
 
 
90
  }
91
 
92
- # Initialize the classifier
93
- xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
94
-
95
- # Setup GridSearchCV
96
- grid_search = GridSearchCV(xgb, param_grid, cv=2, n_jobs=-1, scoring='accuracy')
97
-
98
- # Fit the grid search to the data
99
- grid_search.fit(X_train, y_train)
100
-
101
- # Get the best parameters
102
- best_params = grid_search.best_params_
103
- print("Best parameters:", best_params)
104
 
105
- # Train the model with best parameters
106
- best_xgb = XGBClassifier(**best_params, use_label_encoder=False, eval_metric='logloss')
107
- best_xgb.fit(X_train, y_train)
108
 
109
  # Predict on the test set
110
- y_pred = best_xgb.predict(X_test)
111
- y_pred_proba = best_xgb.predict_proba(X_test)
112
 
113
  # Evaluate the model
114
  accuracy = accuracy_score(y_test, y_pred)
115
  classification_rep = classification_report(y_test, y_pred)
116
 
117
  # Save the model
118
- model_filename = 'xgb_model.joblib'
119
- dump(best_xgb, model_filename)
120
 
121
  # Save the encoders
122
- encoders_filename = 'encoders.joblib'
123
  dump(encoders, encoders_filename)
124
 
125
- return accuracy,classification_rep,"base Model trained"
126
 
127
  @app.get("/trigger_the_data_fecher")
128
  async def your_continuous_function(page: int,paginate: int,Tenant: str):
129
  print("data fetcher running.....")
130
 
131
- # Initialize an empty DataFrame to store the combined data
132
- combined_df = pd.DataFrame()
133
 
134
  # Update the payload for each page
135
  url = "https://dev3.api.curfox.parallaxtec.com/api/ml/order-list?sort=id&paginate="+str(paginate)+"&page="+str(page)
@@ -151,14 +125,49 @@ async def your_continuous_function(page: int,paginate: int,Tenant: str):
151
  df = pd.json_normalize(data)
152
 
153
 
154
- # Concatenate the current page's DataFrame with the combined DataFrame
155
- combined_df = pd.concat([combined_df, df], ignore_index=True)
156
-
157
- data = combined_df[combined_df['status.name'].isin(['RETURN TO CLIENT', 'DELIVERED'])]
158
  print("data collected from page : "+str(page))
159
  #data.to_csv("new.csv")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
161
- accuracy,classification_rep,message = train_the_model(data)
162
 
163
  return {"message":message,"page_number":page,"data_count":data_count,"accuracy":accuracy,"classification_rep":classification_rep}
164
 
 
24
  allow_headers=["*"],
25
  )
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
 
29
+ def train_the_model():
 
 
 
30
 
31
+ data = pd.read_csv("model/trainer_data.csv")
32
+ print(data["customer_name"].count())
33
+
34
+ data = pd.read_csv("model/trainer_data_balanced.csv")
35
+ print(data["customer_name"].count())
36
 
37
+
 
 
38
  # Select columns
39
  selected_columns = ['customer_name', 'customer_address', 'customer_phone',
40
+ 'customer_email', 'cod', 'weight', 'origin_city.name',
41
+ 'destination_city.name', 'status.name']
42
 
43
  # Handling missing values
44
+ #data_filled = data[selected_columns].fillna('Missing')
45
+ data_filled = data[selected_columns].dropna()
46
 
47
  # Encoding categorical variables
48
  encoders = {col: LabelEncoder() for col in selected_columns if data_filled[col].dtype == 'object'}
 
54
  y = data_filled['status.name']
55
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
56
 
57
+ # Parameters to use for the model
58
+ # Parameters to use for the model
59
+ """params = {
60
+ 'colsample_bytree': 0.3,
61
+ 'learning_rate': 0.6,
62
+ 'max_depth': 6,
63
+ 'n_estimators': 100,
64
+ 'subsample': 0.9,
65
+ 'use_label_encoder': False,
66
+ 'eval_metric': 'logloss'
67
+ }"""
68
+ params = {
69
+ 'colsample_bytree': 0.9,
70
+ 'learning_rate': 0.1,
71
+ 'max_depth': 30,
72
+ 'n_estimators': 500,
73
+ 'subsample': 0.9,
74
+ 'use_label_encoder': False,
75
+ 'eval_metric': 'logloss'
76
  }
77
 
78
+ # Initialize the classifier with the specified parameters
79
+ xgb = XGBClassifier(**params)
 
 
 
 
 
 
 
 
 
 
80
 
81
+ # Train the model
82
+ xgb.fit(X_train, y_train)
83
+
84
 
85
  # Predict on the test set
86
+ y_pred = xgb.predict(X_test)
87
+ y_pred_proba = xgb.predict_proba(X_test)
88
 
89
  # Evaluate the model
90
  accuracy = accuracy_score(y_test, y_pred)
91
  classification_rep = classification_report(y_test, y_pred)
92
 
93
  # Save the model
94
+ model_filename = 'model/curfox_xgb_model.joblib'
95
+ dump(xgb, model_filename)
96
 
97
  # Save the encoders
98
+ encoders_filename = 'model/curfox_encoders.joblib'
99
  dump(encoders, encoders_filename)
100
 
101
+ return accuracy,classification_rep,"Model trained with new data"
102
 
103
  @app.get("/trigger_the_data_fecher")
104
  async def your_continuous_function(page: int,paginate: int,Tenant: str):
105
  print("data fetcher running.....")
106
 
 
 
107
 
108
  # Update the payload for each page
109
  url = "https://dev3.api.curfox.parallaxtec.com/api/ml/order-list?sort=id&paginate="+str(paginate)+"&page="+str(page)
 
125
  df = pd.json_normalize(data)
126
 
127
 
128
+ df = df[df['status.name'].isin(['RETURN TO CLIENT', 'DELIVERED'])]
 
 
 
129
  print("data collected from page : "+str(page))
130
  #data.to_csv("new.csv")
131
+
132
+ try:
133
+ file_path = 'model/trainer_data.csv' # Replace with your file path
134
+ source_csv = pd.read_csv(file_path)
135
+ new_data = df
136
+ combined_df_final = pd.concat([source_csv,new_data], ignore_index=True)
137
+
138
+ combined_df_final.to_csv("model/trainer_data.csv")
139
+ print("data added")
140
+ except:
141
+
142
+ df.to_csv("model/trainer_data.csv")
143
+ print("data created")
144
+
145
+ # Load the dataset
146
+ file_path = 'model/trainer_data.csv' # Update to the correct file path
147
+ data = pd.read_csv(file_path)
148
+ # Analyze class distribution
149
+ class_distribution = data['status.name'].value_counts()
150
+ print("Class Distribution before balancing:\n", class_distribution)
151
+
152
+ # Get the size of the largest class to match other classes' sizes
153
+ max_class_size = class_distribution.max()
154
+
155
+ # Oversampling
156
+ oversampled_data = pd.DataFrame()
157
+ for class_name, group in data.groupby('status.name'):
158
+ oversampled_group = resample(group,
159
+ replace=True, # Sample with replacement
160
+ n_samples=max_class_size, # to match majority class
161
+ random_state=123) # for reproducibility
162
+ oversampled_data = pd.concat([oversampled_data, oversampled_group], axis=0)
163
+
164
+ # Verify new class distribution
165
+ print("Class Distribution after oversampling:\n", oversampled_data['status.name'].value_counts())
166
+
167
+ # Save the balanced dataset if needed
168
+ oversampled_data.to_csv('model/trainer_data_balanced.csv', index=False)
169
 
170
+ accuracy,classification_rep,message = train_the_model()
171
 
172
  return {"message":message,"page_number":page,"data_count":data_count,"accuracy":accuracy,"classification_rep":classification_rep}
173