curfox_model_trainer

Sleeping

App Files Community

Arafath10 committed on May 4, 2024

Commit

b9792ed

verified ·

1 Parent(s): 0a66441

Update main.py

Browse files

Files changed (1) hide show

main.py +81 -72

main.py CHANGED Viewed

@@ -24,51 +24,25 @@ app.add_middleware(
     allow_headers=["*"],
 )
-def train_the_model(data):
-    try:
-        new_data = data
-        encoders = load('encoders.joblib')
-        xgb_model = load('xgb_model.joblib')
-        selected_columns = ['customer_name', 'customer_address', 'customer_phone',
-                            'customer_email', 'cod', 'weight', 'origin_city.name',
-                            'destination_city.name', 'status.name']
-        new_data_filled = new_data[selected_columns].fillna('Missing')
-        for col, encoder in encoders.items():
-            if col in new_data_filled.columns:
-                unseen_categories = set(new_data_filled[col]) - set(encoder.classes_)
-                if unseen_categories:
-                    for category in unseen_categories:
-                        encoder.classes_ = np.append(encoder.classes_, category)
-                    new_data_filled[col] = encoder.transform(new_data_filled[col])
-                else:
-                    new_data_filled[col] = encoder.transform(new_data_filled[col])
-        X_new = new_data_filled.drop('status.name', axis=1)
-        y_new = new_data_filled['status.name']
-        X_train, X_test, y_train, y_test = train_test_split(X_new,y_new, test_size=0.2, random_state=42)
-        xgb_model.fit(X_new, y_new)
-        dump(xgb_model,'xgb_model.joblib')
-        y_pred = xgb_model.predict(X_test)
-        accuracy = accuracy_score(y_test, y_pred)
-        classification_rep = classification_report(y_test, y_pred)
-        return accuracy,classification_rep,"Model finetuned with new data."
-    except:
-        data = data
         # Select columns
         selected_columns = ['customer_name', 'customer_address', 'customer_phone',
-                            'customer_email', 'cod', 'weight',
-                            'origin_city.name', 'destination_city.name', 'status.name']
         # Handling missing values
-        data_filled = data[selected_columns].fillna('Missing')
         # Encoding categorical variables
         encoders = {col: LabelEncoder() for col in selected_columns if data_filled[col].dtype == 'object'}
@@ -80,56 +54,56 @@ def train_the_model(data):
         y = data_filled['status.name']
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-        # Setup the hyperparameter grid to search
-        param_grid = {
-            'max_depth': [3, 4, 5],
-            'learning_rate': [0.01, 0.1, 0.4],
-            'n_estimators': [100, 200, 300],
-            'subsample': [0.8, 0.9, 1],
-            'colsample_bytree': [0.3, 0.7]
         }
-        # Initialize the classifier
-        xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
-        # Setup GridSearchCV
-        grid_search = GridSearchCV(xgb, param_grid, cv=2, n_jobs=-1, scoring='accuracy')
-        # Fit the grid search to the data
-        grid_search.fit(X_train, y_train)
-        # Get the best parameters
-        best_params = grid_search.best_params_
-        print("Best parameters:", best_params)
-        # Train the model with best parameters
-        best_xgb = XGBClassifier(**best_params, use_label_encoder=False, eval_metric='logloss')
-        best_xgb.fit(X_train, y_train)
         # Predict on the test set
-        y_pred = best_xgb.predict(X_test)
-        y_pred_proba = best_xgb.predict_proba(X_test)
         # Evaluate the model
         accuracy = accuracy_score(y_test, y_pred)
         classification_rep = classification_report(y_test, y_pred)
         # Save the model
-        model_filename = 'xgb_model.joblib'
-        dump(best_xgb, model_filename)
         # Save the encoders
-        encoders_filename = 'encoders.joblib'
         dump(encoders, encoders_filename)
-        return accuracy,classification_rep,"base Model trained"
 @app.get("/trigger_the_data_fecher")
 async def your_continuous_function(page: int,paginate: int,Tenant: str):
     print("data fetcher running.....")
-    # Initialize an empty DataFrame to store the combined data
-    combined_df = pd.DataFrame()
     # Update the payload for each page
     url = "https://dev3.api.curfox.parallaxtec.com/api/ml/order-list?sort=id&paginate="+str(paginate)+"&page="+str(page)
@@ -151,14 +125,49 @@ async def your_continuous_function(page: int,paginate: int,Tenant: str):
     df = pd.json_normalize(data)
-    # Concatenate the current page's DataFrame with the combined DataFrame
-    combined_df = pd.concat([combined_df, df], ignore_index=True)
-    data = combined_df[combined_df['status.name'].isin(['RETURN TO CLIENT', 'DELIVERED'])]
     print("data collected from page : "+str(page))
     #data.to_csv("new.csv")
-    accuracy,classification_rep,message = train_the_model(data)
     return {"message":message,"page_number":page,"data_count":data_count,"accuracy":accuracy,"classification_rep":classification_rep}

     allow_headers=["*"],
 )
+def train_the_model():
+        data = pd.read_csv("model/trainer_data.csv")
+        print(data["customer_name"].count())
+        data = pd.read_csv("model/trainer_data_balanced.csv")
+        print(data["customer_name"].count())
         # Select columns
         selected_columns = ['customer_name', 'customer_address', 'customer_phone',
+                            'customer_email', 'cod', 'weight', 'origin_city.name',
+                            'destination_city.name', 'status.name']
         # Handling missing values
+        #data_filled = data[selected_columns].fillna('Missing')
+        data_filled = data[selected_columns].dropna()
         # Encoding categorical variables
         encoders = {col: LabelEncoder() for col in selected_columns if data_filled[col].dtype == 'object'}
         y = data_filled['status.name']
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+        # Parameters to use for the model
+        # Parameters to use for the model
+        """params = {
+            'colsample_bytree': 0.3,
+            'learning_rate': 0.6,
+            'max_depth': 6,
+            'n_estimators': 100,
+            'subsample': 0.9,
+            'use_label_encoder': False,
+            'eval_metric': 'logloss'
+        }"""
+        params = {
+            'colsample_bytree': 0.9,
+            'learning_rate': 0.1,
+            'max_depth': 30,
+            'n_estimators': 500,
+            'subsample': 0.9,
+            'use_label_encoder': False,
+            'eval_metric': 'logloss'
         }
+        # Initialize the classifier with the specified parameters
+        xgb = XGBClassifier(**params)
+        # Train the model
+        xgb.fit(X_train, y_train)
         # Predict on the test set
+        y_pred = xgb.predict(X_test)
+        y_pred_proba = xgb.predict_proba(X_test)
         # Evaluate the model
         accuracy = accuracy_score(y_test, y_pred)
         classification_rep = classification_report(y_test, y_pred)
         # Save the model
+        model_filename = 'model/curfox_xgb_model.joblib'
+        dump(xgb, model_filename)
         # Save the encoders
+        encoders_filename = 'model/curfox_encoders.joblib'
         dump(encoders, encoders_filename)
+        return accuracy,classification_rep,"Model trained with new data"
 @app.get("/trigger_the_data_fecher")
 async def your_continuous_function(page: int,paginate: int,Tenant: str):
     print("data fetcher running.....")
     # Update the payload for each page
     url = "https://dev3.api.curfox.parallaxtec.com/api/ml/order-list?sort=id&paginate="+str(paginate)+"&page="+str(page)
     df = pd.json_normalize(data)
+    df = df[df['status.name'].isin(['RETURN TO CLIENT', 'DELIVERED'])]
     print("data collected from page : "+str(page))
     #data.to_csv("new.csv")
+    try:
+        file_path = 'model/trainer_data.csv'  # Replace with your file path
+        source_csv = pd.read_csv(file_path)
+        new_data = df
+        combined_df_final = pd.concat([source_csv,new_data], ignore_index=True)
+        combined_df_final.to_csv("model/trainer_data.csv")
+        print("data added")
+    except:
+        df.to_csv("model/trainer_data.csv")
+        print("data created")
+    # Load the dataset
+    file_path = 'model/trainer_data.csv'  # Update to the correct file path
+    data = pd.read_csv(file_path)
+    # Analyze class distribution
+    class_distribution = data['status.name'].value_counts()
+    print("Class Distribution before balancing:\n", class_distribution)
+    # Get the size of the largest class to match other classes' sizes
+    max_class_size = class_distribution.max()
+    # Oversampling
+    oversampled_data = pd.DataFrame()
+    for class_name, group in data.groupby('status.name'):
+        oversampled_group = resample(group,
+                                     replace=True,  # Sample with replacement
+                                     n_samples=max_class_size,  # to match majority class
+                                     random_state=123)  # for reproducibility
+        oversampled_data = pd.concat([oversampled_data, oversampled_group], axis=0)
+    # Verify new class distribution
+    print("Class Distribution after oversampling:\n", oversampled_data['status.name'].value_counts())
+    # Save the balanced dataset if needed
+    oversampled_data.to_csv('model/trainer_data_balanced.csv', index=False)
+    accuracy,classification_rep,message = train_the_model()
     return {"message":message,"page_number":page,"data_count":data_count,"accuracy":accuracy,"classification_rep":classification_rep}