# Extraction residue from the hosting page (Hugging Face Space status:
# "Spaces: Sleeping / Sleeping"), commented out so the file remains valid Python.
import asyncio | |
from fastapi import FastAPI | |
from fastapi.middleware.cors import CORSMiddleware | |
import requests | |
import pandas as pd | |
import json | |
import httpx | |
import pandas as pd | |
from sklearn.model_selection import train_test_split, GridSearchCV | |
from sklearn.preprocessing import LabelEncoder | |
from xgboost import XGBClassifier | |
from sklearn.metrics import accuracy_score, classification_report | |
from joblib import dump, load | |
import numpy as np | |
# FastAPI application with fully permissive CORS so browser clients on any
# origin can reach the training endpoints.
app = FastAPI()

# NOTE(review): browsers reject credentialed cross-origin requests when
# allow_origins=["*"] is combined with allow_credentials=True; if cookies or
# auth headers are ever sent cross-origin, list explicit origins instead.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
def train_the_model(data, page):
    """Train or incrementally update the XGBoost delivery-status classifier.

    Parameters
    ----------
    data : pandas.DataFrame
        Flattened order records; must contain the columns listed in
        ``selected_columns`` below (including the ``status.name`` target).
    page : int
        Page number of the fetched data. ``page == 2`` means "update the
        previously saved model with new data"; any other value trains a
        fresh base model from scratch.

    Side effects: persists ``xgb_model.joblib`` (and, when training a fresh
    base model, ``encoders.joblib``) in the current working directory.
    """
    # Shared feature/target column list (was duplicated in both branches).
    selected_columns = ['customer_name', 'customer_address', 'customer_phone',
                        'customer_email', 'cod', 'weight', 'origin_city.name',
                        'destination_city.name', 'status.name']

    if page == 2:
        # --- Incremental update of the previously saved model ---

        def evaluate_model(model, X_eval, y_eval):
            # Print and return the model's accuracy on a hold-out set.
            y_pred = model.predict(X_eval)
            accuracy = accuracy_score(y_eval, y_pred)
            print("Accuracy:", accuracy)
            print("Classification Report:\n", classification_report(y_eval, y_pred))
            return accuracy

        encoders = load('encoders.joblib')
        xgb_model = load('xgb_model.joblib')

        new_data_filled = data[selected_columns].fillna('Missing')

        # Extend each saved LabelEncoder with categories it has not seen
        # before, then transform the column to integer codes.
        for col, encoder in encoders.items():
            if col in new_data_filled.columns:
                unseen_categories = set(new_data_filled[col]) - set(encoder.classes_)
                for category in unseen_categories:
                    encoder.classes_ = np.append(encoder.classes_, category)
                new_data_filled[col] = encoder.transform(new_data_filled[col])

        X_new = new_data_filled.drop('status.name', axis=1)
        y_new = new_data_filled['status.name']

        # BUG FIX: the original evaluated on X_test/y_test, which were never
        # defined in this branch (guaranteed NameError). Hold out 20% of the
        # new data so the updated model can actually be evaluated.
        X_train, X_test, y_train, y_test = train_test_split(
            X_new, y_new, test_size=0.2, random_state=42)

        xgb_model.fit(X_train, y_train)
        dump(xgb_model, 'xgb_model.joblib')
        print("Model updated with new data.")

        updated_model_accuracy = evaluate_model(xgb_model, X_test, y_test)
        print("Updated model accuracy:", updated_model_accuracy)
    else:
        # --- Train a fresh base model from scratch ---
        data_filled = data[selected_columns].fillna('Missing')

        # Fit one LabelEncoder per categorical (object-dtype) column.
        encoders = {col: LabelEncoder()
                    for col in selected_columns
                    if data_filled[col].dtype == 'object'}
        for col, encoder in encoders.items():
            data_filled[col] = encoder.fit_transform(data_filled[col])

        X = data_filled.drop('status.name', axis=1)
        y = data_filled['status.name']
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42)

        # Hyperparameter grid for GridSearchCV (cv=2 keeps the search cheap).
        param_grid = {
            'max_depth': [3, 4, 5],
            'learning_rate': [0.01, 0.1, 0.4],
            'n_estimators': [100, 200, 300],
            'subsample': [0.8, 0.9, 1],
            'colsample_bytree': [0.3, 0.7],
        }

        xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
        grid_search = GridSearchCV(xgb, param_grid, cv=2, n_jobs=-1,
                                   scoring='accuracy')
        grid_search.fit(X_train, y_train)

        best_params = grid_search.best_params_
        print("Best parameters:", best_params)

        # Refit on the training split with the best parameters.
        best_xgb = XGBClassifier(**best_params, use_label_encoder=False,
                                 eval_metric='logloss')
        best_xgb.fit(X_train, y_train)

        # Evaluate on the hold-out split. (Dropped the unused y_pred_proba
        # and the duplicate classification_report call from the original.)
        y_pred = best_xgb.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        classification_rep = classification_report(y_test, y_pred)
        print("Accuracy:", accuracy)
        print("Classification Report:\n", classification_rep)

        # Persist the model and the fitted encoders for later incremental runs.
        model_filename = 'xgb_model.joblib'
        dump(best_xgb, model_filename)
        encoders_filename = 'encoders.joblib'
        dump(encoders, encoders_filename)
        print(f"Model saved as {model_filename}")
        print(f"Encoders saved as {encoders_filename}")
        print("new base model trained")
async def your_continuous_function(page: int, paginate: int, Tenant: str):
    """Fetch one page of orders for `Tenant` and (re)train the model on it.

    Parameters
    ----------
    page : int
        Page number to fetch; also passed through to ``train_the_model`` to
        select incremental-update vs. fresh-training mode.
    paginate : int
        Page size requested from the order-list API.
    Tenant : str
        Value for the ``X-Tenant`` header (e.g. 'royalexpress').

    Returns a short status string with the page number and the number of
    records fetched (counted before status filtering).
    """
    print("data fetcher running.....")

    url = ("https://dev3.api.curfox.parallaxtec.com/api/ml/order-list"
           "?sort=id&paginate=" + str(paginate) + "&page=" + str(page))
    headers = {
        'Accept': 'application/json',
        'X-Tenant': Tenant  # 'royalexpress'
    }

    # BUG FIX: the original issued a blocking `requests.request(...)` inside
    # an async function, stalling the event loop for the whole HTTP round
    # trip. Use the already-imported httpx async client instead.
    async with httpx.AsyncClient() as client:
        response = await client.get(url, headers=headers)
    json_response = response.json()

    # Extract the record list; count before filtering so the returned
    # message reflects the raw page size.
    data = json_response['data']
    data_count = len(data)
    df = pd.json_normalize(data)

    # Keep only terminal statuses the model is trained to predict.
    # (Removed the original's no-op concat of a single page into an empty
    # DataFrame.)
    data = df[df['status.name'].isin(['RETURN TO CLIENT', 'DELIVERED'])]
    print("data collected from page : " + str(page))
    # data.to_csv("new.csv")
    train_the_model(data, page)

    return "model trained with page number: " + str(page) + " data count :" + str(data_count)
async def test_api(): | |
encoders = load('encoders.joblib') | |
xgb_model = load('xgb_model.joblib') |