import asyncio
import datetime
import json
import multiprocessing
import os

import httpx
import numpy as np
import pandas as pd
import requests
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from joblib import dump, load
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from xgboost import XGBClassifier
app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
async def train_the_model(Tenant: str):
    # Load the tenant's accumulated training data
    data = pd.read_csv(f"model/{Tenant}trainer_data_v1.csv")
    print(data["customer_name"].count())

    # Analyze class distribution before balancing
    class_distribution = data['status.name'].value_counts()
    bf = str(class_distribution)
    print("Class Distribution before balancing:\n", class_distribution)

    # Oversample every class up to the size of the largest class
    max_class_size = class_distribution.max()
    oversampled_data = pd.DataFrame()
    for class_name, group in data.groupby('status.name'):
        oversampled_group = resample(group,
                                     replace=True,               # sample with replacement
                                     n_samples=max_class_size,   # match the majority class
                                     random_state=123)           # for reproducibility
        oversampled_data = pd.concat([oversampled_data, oversampled_group], axis=0)

    # Verify the new class distribution
    print("Class Distribution after oversampling:\n",
          oversampled_data['status.name'].value_counts())
    af = str(oversampled_data['status.name'].value_counts())
    data = oversampled_data

    # Feature columns plus the 'status.name' target
    selected_columns = ['customer_name', 'customer_address', 'customer_phone',
                        'cod', 'weight', 'origin_city.name',
                        'destination_city.name', 'status.name', 'created_at']

    # Drop rows with missing values and normalize dtypes
    data_filled = data[selected_columns].dropna()
    data_filled['customer_phone'] = data_filled['customer_phone'].astype(str)
    data_filled['created_at'] = data_filled['created_at'].astype(str)

    # Label-encode every categorical (object-dtype) column
    encoders = {col: LabelEncoder() for col in selected_columns
                if data_filled[col].dtype == 'object'}
    for col, encoder in encoders.items():
        data_filled[col] = encoder.fit_transform(data_filled[col])

    # Split into train and test sets
    X = data_filled.drop('status.name', axis=1)
    y = data_filled['status.name']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Model hyperparameters (an earlier, lighter configuration used
    # colsample_bytree=0.3, learning_rate=0.6, max_depth=6, n_estimators=100)
    params = {
        'colsample_bytree': 0.9,
        'learning_rate': 0.1,
        'max_depth': 30,
        'n_estimators': 600,
        'subsample': 0.9,
        'eval_metric': 'logloss'
    }

    # Train the classifier
    xgb = XGBClassifier(**params)
    xgb.fit(X_train, y_train)

    # Evaluate on the held-out test set
    y_pred = xgb.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    # Persist the model and the fitted encoders
    model_filename = f'model/{Tenant}_curfox_xgb_model.joblib'
    dump(xgb, model_filename)
    encoders_filename = f'model/{Tenant}_curfox_encoders.joblib'
    dump(encoders, encoders_filename)

    return accuracy, classification_rep, "Model trained with new data for:", model_filename, af, bf
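
# A minimal sketch (not in the source) of exercising train_the_model locally.
# The tenant name "demo" is an assumption for illustration; it presumes
# model/demotrainer_data_v1.csv already exists (see the data fetchers below).
#
#   accuracy, report, *_ = asyncio.run(train_the_model("demo"))
#   print(f"accuracy={accuracy:.3f}\n{report}")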
async def continuous_function(page: int, paginate: int, Tenant: str):
    print("data fetcher running.....")

    # Fetch one page of orders for this tenant
    # (dev URL: https://dev3.api.curfox.parallaxtec.com/api/ml/order-list)
    url = f"https://v1.api.curfox.com/api/ml/order-list?sort=id&paginate={paginate}&page={page}"
    headers = {
        'Accept': 'application/json',
        'X-Tenant': Tenant  # e.g. 'royalexpress'
    }
    response = requests.get(url, headers=headers)
    json_response = response.json()

    # Flatten the 'data' payload and keep only terminal order statuses
    data = json_response['data']
    data_count = len(data)
    df = pd.json_normalize(data)
    df = df[df['status.name'].isin(['RETURN TO CLIENT', 'DELIVERED'])]
    print("data collected from page : " + str(page))

    # Append to the tenant's training CSV, creating it on first run
    file_path = f'model/{Tenant}trainer_data_v1.csv'
    try:
        source_csv = pd.read_csv(file_path)
        combined_df_final = pd.concat([source_csv, df], ignore_index=True)
        combined_df_final.to_csv(file_path, index=False)
        print("data added")
        message = "data added"
    except FileNotFoundError:
        df.to_csv(file_path, index=False)
        print("data created")
        message = "data created"

    return {"message": message, "page_number": page, "data_count": data_count, 'X-Tenant': Tenant}
async def your_continuous_function(page: int, paginate: int, Tenant: str):
    print("data fetcher running.....")

    # Same fetch as continuous_function, but appending to the unversioned CSV
    url = f"https://v1.api.curfox.com/api/ml/order-list?sort=id&paginate={paginate}&page={page}"
    headers = {
        'Accept': 'application/json',
        'X-Tenant': Tenant  # e.g. 'royalexpress'
    }
    response = requests.get(url, headers=headers)
    json_response = response.json()

    data = json_response['data']
    data_count = len(data)
    df = pd.json_normalize(data)
    df = df[df['status.name'].isin(['RETURN TO CLIENT', 'DELIVERED'])]
    print("data collected from page : " + str(page))

    file_path = f'model/{Tenant}trainer_data_.csv'
    try:
        source_csv = pd.read_csv(file_path)
        combined_df_final = pd.concat([source_csv, df], ignore_index=True)
        combined_df_final.to_csv(file_path, index=False)
        print("data added")
    except FileNotFoundError:
        df.to_csv(file_path, index=False)
        print("data created")

    return {"message": "done", "page_number": page, "data_count": data_count, 'X-Tenant': Tenant}
async def model_updated_time(Tenant: str):
    # Report model file timestamps plus the CPU cores available for training
    available_cores = multiprocessing.cpu_count()
    try:
        m_time_encoder = os.path.getmtime(f'model/{Tenant}_curfox_encoders.joblib')
        m_time_model = os.path.getmtime(f'model/{Tenant}_curfox_xgb_model.joblib')
        return {
            "Tenant": Tenant,
            "base model created time": datetime.datetime.fromtimestamp(m_time_encoder),
            "last model updated time": datetime.datetime.fromtimestamp(m_time_model),
            "Number of available CPU cores": available_cores
        }
    except FileNotFoundError:
        return {"message": "no model found, so first train the model using the data fetcher"}
# Endpoint for making predictions
def predict(
    Tenant: str,
    customer_name: str,
    customer_address: str,
    customer_phone: str,
    cod: str,
    weight: str,
    origin_city_name: str,
    destination_city_name: str,
    created_at: str,
    customer_email: str,
    pickup_address: str,
    origin_country: str
):
    try:
        # Load the tenant's trained model and fitted encoders
        xgb_model = load(f'model/{Tenant}_curfox_xgb_model.joblib')
        encoders = load(f'model/{Tenant}_curfox_encoders.joblib')
    except FileNotFoundError:
        return {"message": "no model found, so first train the model using the data fetcher"}

    # Map unseen labels to -1 instead of raising during encoding
    def safe_transform(encoder, column):
        classes = encoder.classes_
        return [encoder.transform([x])[0] if x in classes else -1 for x in column]

    input_data = {
        'customer_name': customer_name,
        'customer_address': customer_address,
        'customer_phone': customer_phone,  # 'customer_email': customer_email,
        'cod': int(cod),
        'weight': int(weight),
        'origin_city.name': origin_city_name,
        'destination_city.name': destination_city_name,
        'created_at': created_at
    }
    input_df = pd.DataFrame([input_data])

    # Encode categorical variables with the same encoders used during training
    for col in input_df.columns:
        if col in encoders:
            input_df[col] = safe_transform(encoders[col], input_df[col])

    # Predict the status and its probability
    pred = xgb_model.predict(input_df)
    pred_proba = xgb_model.predict_proba(input_df)

    predicted_status = "Unknown" if pred[0] == -1 else encoders['status.name'].inverse_transform(pred)[0]
    probability = pred_proba[0][pred[0]] * 100 if pred[0] != -1 else "Unknown"
    if predicted_status == "RETURN TO CLIENT":
        probability = 100 - probability
    return {"predicted_status": predicted_status, "Probability": round(probability, 2), "Tenant_new": Tenant}