from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import requests
import pandas as pd
import os
import datetime
import multiprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, classification_report
from joblib import dump, load

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.get("/train_the_model_new_v2")
async def train_the_model(Tenant: str):
    # Load the tenant's accumulated training data
    data = pd.read_csv(f"model/{Tenant}trainer_data_v1.csv")
    print(data["customer_name"].count())

    # Analyze the class distribution before balancing
    class_distribution = data['status.name'].value_counts()
    bf = str(class_distribution)
    print("Class Distribution before balancing:\n", class_distribution)

    # Size of the largest class; every other class is oversampled to match it
    max_class_size = class_distribution.max()

    # Oversample each class with replacement up to the majority-class size
    oversampled_data = pd.DataFrame()
    for class_name, group in data.groupby('status.name'):
        oversampled_group = resample(group,
                                     replace=True,               # sample with replacement
                                     n_samples=max_class_size,   # match the majority class
                                     random_state=123)           # for reproducibility
        oversampled_data = pd.concat([oversampled_data, oversampled_group], axis=0)

    # Verify the new class distribution
    print("Class Distribution after oversampling:\n",
          oversampled_data['status.name'].value_counts())
    data = oversampled_data
    af = str(oversampled_data['status.name'].value_counts())

    # Feature and target columns used for training
    selected_columns = ['customer_name', 'customer_address', 'customer_phone',
                        'cod', 'weight', 'origin_city.name',
                        'destination_city.name', 'status.name', 'created_at']

    # Drop rows with missing values and normalize dtypes
    data_filled = data[selected_columns].dropna()
    data_filled['customer_phone'] = data_filled['customer_phone'].astype(str)
    data_filled['created_at'] = data_filled['created_at'].astype(str)

    # Label-encode every object-typed column (including the target)
    encoders = {col: LabelEncoder() for col in selected_columns
                if data_filled[col].dtype == 'object'}
    for col, encoder in encoders.items():
        data_filled[col] = encoder.fit_transform(data_filled[col])

    # Splitting the dataset
    X = data_filled.drop('status.name', axis=1)
    y = data_filled['status.name']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=42)

    # Model hyperparameters. An earlier, lighter configuration was
    # colsample_bytree=0.3, learning_rate=0.6, max_depth=6, n_estimators=100.
    # Note: 'use_label_encoder' is deprecated in recent xgboost releases and
    # can be dropped there.
    params = {
        'colsample_bytree': 0.9,
        'learning_rate': 0.1,
        'max_depth': 30,
        'n_estimators': 600,
        'subsample': 0.9,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }

    # Initialize and train the classifier
    xgb = XGBClassifier(**params)
    xgb.fit(X_train, y_train)

    # Evaluate on the held-out test set
    y_pred = xgb.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    # Persist the model and the fitted encoders
    model_filename = f'model/{Tenant}_curfox_xgb_model.joblib'
    dump(xgb, model_filename)
    encoders_filename = f'model/{Tenant}_curfox_encoders.joblib'
    dump(encoders, encoders_filename)

    return accuracy, classification_rep, "Model trained with new data for :", model_filename, af, bf
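
# Example client call for the training endpoint (a sketch: the base URL and
# port assume a local uvicorn default, and 'royalexpress' is the sample tenant
# named in the comments below):
#
#   import httpx
#   resp = httpx.get(
#       "http://localhost:8000/train_the_model_new_v2",
#       params={"Tenant": "royalexpress"},
#       timeout=None,  # training with 600 estimators can take a while
#   )
#   print(resp.json())  # accuracy, classification report, model path, class counts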

@app.get("/trigger_the_data_fecher_for_me")
async def continuous_function(page: int, paginate: int, Tenant: str):
    print("data fetcher running.....")

    # Fetch one page of orders for the tenant
    # (dev host: https://dev3.api.curfox.parallaxtec.com)
    url = ("https://v1.api.curfox.com/api/ml/order-list?sort=id"
           f"&paginate={paginate}&page={page}")
    headers = {
        'Accept': 'application/json',
        'X-Tenant': Tenant  # e.g. 'royalexpress'
    }
    response = requests.get(url, headers=headers)
    json_response = response.json()

    # Flatten the nested order records and keep only terminal statuses
    data = json_response['data']
    data_count = len(data)
    df = pd.json_normalize(data)
    df = df[df['status.name'].isin(['RETURN TO CLIENT', 'DELIVERED'])]
    print("data collected from page : " + str(page))

    # Append to the tenant's CSV, creating it on the first run
    file_path = f'model/{Tenant}trainer_data_v1.csv'
    try:
        source_csv = pd.read_csv(file_path)
        combined_df_final = pd.concat([source_csv, df], ignore_index=True)
        combined_df_final.to_csv(file_path, index=False)
        print("data added")
        message = "data added"
    except FileNotFoundError:
        df.to_csv(file_path, index=False)
        print("data created")
        message = "data created"

    return {"message": message, "page_number": page,
            "data_count": data_count, 'X-Tenant': Tenant}


@app.get("/trigger_the_data_fecher")
async def your_continuous_function(page: int, paginate: int, Tenant: str):
    print("data fetcher running.....")

    # Same fetch logic as above, but writing to the legacy CSV name
    url = ("https://v1.api.curfox.com/api/ml/order-list?sort=id"
           f"&paginate={paginate}&page={page}")
    headers = {
        'Accept': 'application/json',
        'X-Tenant': Tenant
    }
    response = requests.get(url, headers=headers)
    json_response = response.json()

    data = json_response['data']
    data_count = len(data)
    df = pd.json_normalize(data)
    df = df[df['status.name'].isin(['RETURN TO CLIENT', 'DELIVERED'])]
    print("data collected from page : " + str(page))

    file_path = f'model/{Tenant}trainer_data_.csv'
    try:
        source_csv = pd.read_csv(file_path)
        combined_df_final = pd.concat([source_csv, df], ignore_index=True)
        combined_df_final.to_csv(file_path, index=False)
        print("data added")
    except FileNotFoundError:
        df.to_csv(file_path, index=False)
        print("data created")

    return {"message": "done", "page_number": page,
            "data_count": data_count, 'X-Tenant': Tenant}


@app.get("/get_latest_model_updated_time")
async def model_updated_time(Tenant: str):
    # Number of CPU cores available to the service
    available_cores = multiprocessing.cpu_count()
    try:
        m_time_encoder = os.path.getmtime(f'model/{Tenant}_curfox_encoders.joblib')
        m_time_model = os.path.getmtime(f'model/{Tenant}_curfox_xgb_model.joblib')
        return {
            "Tenant": Tenant,
            "base model created time": datetime.datetime.fromtimestamp(m_time_encoder),
            "last model updated time": datetime.datetime.fromtimestamp(m_time_model),
            "Number of available CPU cores": available_cores
        }
    except OSError:
        return {"message": "No model found; train the model first using the data fetcher endpoints."}
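
# Sketch of a paginated ingestion loop against the fetcher endpoint above.
# The page range and page size are hypothetical; adjust them to the tenant's
# actual data volume:
#
#   import httpx
#   for page in range(1, 11):
#       r = httpx.get(
#           "http://localhost:8000/trigger_the_data_fecher",
#           params={"page": page, "paginate": 100, "Tenant": "royalexpress"},
#       )
#       print(r.json()["data_count"])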

# Endpoint for making predictions
@app.post("/predict")
def predict(
    Tenant: str,
    customer_name: str,
    customer_address: str,
    customer_phone: str,
    cod: str,
    weight: str,
    origin_city_name: str,
    destination_city_name: str,
    created_at: str,
    customer_email: str,
    pickup_address: str,
    origin_country: str
):
    try:
        # Load the trained model and encoders for this tenant
        xgb_model = load(f'model/{Tenant}_curfox_xgb_model.joblib')
        encoders = load(f'model/{Tenant}_curfox_encoders.joblib')
    except FileNotFoundError:
        return {"message": "No model found; train the model first using the data fetcher endpoints."}

    # Map labels unseen during training to -1 instead of raising
    def safe_transform(encoder, column):
        classes = encoder.classes_
        return [encoder.transform([x])[0] if x in classes else -1 for x in column]

    input_data = {
        'customer_name': customer_name,
        'customer_address': customer_address,
        'customer_phone': customer_phone,
        # 'customer_email': customer_email,
        'cod': int(cod),
        'weight': int(weight),
        'origin_city.name': origin_city_name,
        'destination_city.name': destination_city_name,
        'created_at': created_at
    }
    input_df = pd.DataFrame([input_data])

    # Encode categorical variables with the encoders fitted during training
    for col in input_df.columns:
        if col in encoders:
            input_df[col] = safe_transform(encoders[col], input_df[col])

    # Predict and obtain class probabilities
    pred = xgb_model.predict(input_df)
    pred_proba = xgb_model.predict_proba(input_df)

    # Decode the predicted class and report its probability as a percentage
    predicted_status = encoders['status.name'].inverse_transform(pred)[0]
    probability = pred_proba[0][pred[0]] * 100
    if predicted_status == "RETURN TO CLIENT":
        probability = 100 - probability

    return {"predicted_status": predicted_status,
            "Probability": round(probability, 2),
            "Tenant_new": Tenant}
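
# Minimal entry point for running the service directly (a sketch; assumes
# uvicorn is installed, and the host/port are illustrative defaults; production
# deployments would normally use the uvicorn CLI or a process manager):
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)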