Spaces:

Chittrarasu
/

Cricket-match-prediction-FastAPI

Sleeping

App Files Files Community

sivapriya175 committed on Mar 19

Commit

27e29a2

1 Parent(s): 9f8cfb6

deploy backend files

Browse files

Files changed (6) hide show

__pycache__/routers.cpython-313.pyc +0 -0
__pycache__/services.cpython-313.pyc +0 -0
main.py +6 -1
models/__pycache__/train_model.cpython-313.pyc +0 -0
models/train_model.py +217 -57
services.py +233 -62

__pycache__/routers.cpython-313.pyc CHANGED Viewed

Binary files a/__pycache__/routers.cpython-313.pyc and b/__pycache__/routers.cpython-313.pyc differ

__pycache__/services.cpython-313.pyc CHANGED Viewed

Binary files a/__pycache__/services.cpython-313.pyc and b/__pycache__/services.cpython-313.pyc differ

main.py CHANGED Viewed

@@ -1,10 +1,15 @@
 from fastapi import FastAPI
 from routers import router
 app = FastAPI(title="Cricket Statistics API")
 app.include_router(router)
 @app.get("/")
 async def root():
-    return {"message": "Welcome to the Cricket Statistics API"}

 from fastapi import FastAPI
 from routers import router
+from services import initialize_models
 app = FastAPI(title="Cricket Statistics API")
 app.include_router(router)
+# Startup hook: loads the datasets and trains the prediction models.
+# NOTE(review): services.py in this commit also calls initialize_models()
+# at import time, so this may be a second invocation - confirm whether both
+# call sites are needed.
+@app.on_event("startup")
+async def startup_event():
+    initialize_models()  # Synchronous call; the hook returns only after it finishes
 @app.get("/")
 async def root():
+    # Landing endpoint: returns a static welcome message.
+    return {"message": "Welcome to the Cricket Statistics API"}

models/__pycache__/train_model.cpython-313.pyc CHANGED Viewed

Binary files a/models/__pycache__/train_model.cpython-313.pyc and b/models/__pycache__/train_model.cpython-313.pyc differ

models/train_model.py CHANGED Viewed

@@ -6,82 +6,242 @@ from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error, accuracy_score, r2_score
 from sklearn.preprocessing import StandardScaler
-# 🔹 Load datasets
-ball_df = pd.read_csv('data/cleaned_ball_data.csv')
-match_df = pd.read_csv('data/cleaned_match_data.csv')
-# Convert date columns
-match_df['date'] = pd.to_datetime(match_df['date'], errors='coerce')
-ball_df['start_date'] = pd.to_datetime(ball_df['start_date'], errors='coerce')
-# 🔹 Compute team total scores and merge correctly
-team_scores = ball_df.groupby(['match_id', 'batting_team'])['total_runs'].sum().reset_index()
-team_scores.rename(columns={'total_runs': 'team_total'}, inplace=True)
-# Merge team scores with match_df
-match_df = match_df.merge(team_scores, left_on=['id', 'team1'], right_on=['match_id', 'batting_team'], how='left')
-match_df.rename(columns={'team_total': 'team1_total'}, inplace=True)
-match_df = match_df.merge(team_scores, left_on=['id', 'team2'], right_on=['match_id', 'batting_team'], how='left')
-match_df.rename(columns={'team_total': 'team2_total'}, inplace=True)
-# Fill missing values with 0 to avoid KeyError
-match_df['team1_total'] = match_df['team1_total'].fillna(0)
-match_df['team2_total'] = match_df['team2_total'].fillna(0)
-# Drop unnecessary columns
-match_df.drop(columns=['batting_team', 'match_id'], errors='ignore', inplace=True)
-# 🔹 Train Player Score Model
-def train_player_score_model():
-    player_runs = ball_df.groupby(['match_id', 'striker'])['runs_off_bat'].sum().reset_index()
     player_runs.rename(columns={'runs_off_bat': 'player_total'}, inplace=True)
     player_data = player_runs.merge(match_df, left_on='match_id', right_on='id', how='left')
-    # Feature Engineering
     player_data['player_avg'] = player_data.groupby('striker')['player_total'].transform('mean')
     player_data['venue_index'] = player_data['venue'].astype('category').cat.codes
     player_data['city_index'] = player_data['city'].astype('category').cat.codes
     player_data['toss_winner_index'] = player_data['toss_winner'].astype('category').cat.codes
     player_data['toss_decision_index'] = player_data['toss_decision'].map({'bat': 1, 'field': 0}).fillna(0).astype(int)
-    # Features and Target
-    X = player_data[['player_avg', 'venue_index', 'city_index', 'toss_winner_index', 'toss_decision_index']].dropna()
     y = player_data.loc[X.index, 'player_total']
     # Scale features
     scaler = StandardScaler()
     X_scaled = scaler.fit_transform(X)
-    # Train Model
-    model = HistGradientBoostingRegressor(random_state=42, learning_rate=0.1, max_iter=100)
-    model.fit(X_scaled, y)
-    return model, scaler
-# 🔹 Train Team Performance Model
-def train_team_performance_model():
-    data = match_df[['team1', 'team2', 'winner', 'team1_total', 'team2_total', 'venue', 'city', 'toss_winner', 'toss_decision']].dropna()
-    data['team1_index'] = data['team1'].astype('category').cat.codes
-    data['team2_index'] = data['team2'].astype('category').cat.codes
-    data['winner_index'] = (data['winner'] == data['team1']).astype(int)
-    # Features and targets
-    X = data[['team1_index', 'team2_index']]
-    y_win = data['winner_index']
-    y_score = data[['team1_total', 'team2_total']]
-    # Train Team Win Prediction Model
-    win_model = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42)
-    win_model.fit(X, y_win)
-    # Train Score Prediction Model
-    base_score_model = HistGradientBoostingRegressor(random_state=42, learning_rate=0.1, max_iter=100)
-    score_model = MultiOutputRegressor(base_score_model)
-    score_model.fit(X, y_score)
-    return win_model, score_model
-# 🔹 Train models dynamically
-player_score_model, player_scaler = train_player_score_model()
-team_win_model, team_score_model = train_team_performance_model()

 from sklearn.metrics import mean_squared_error, accuracy_score, r2_score
 from sklearn.preprocessing import StandardScaler
+# Load and preprocess data (same as original)
+def load_and_preprocess_data():
+    # Load datasets with exact column names
+    ball_df = pd.read_csv('data/cleaned_ball_data.csv',
+                          dtype={
+                              'match_id': str, 'season': str, 'start_date': str, 'venue': str,
+                              'innings': int, 'ball': float, 'batting_team': str, 'bowling_team': str,
+                              'striker': str, 'non_striker': str, 'bowler': str, 'runs_off_bat': int,
+                              'extras': int, 'wides': float, 'noballs': float, 'byes': float,
+                              'legbyes': float, 'penalty': float, 'wicket_type': str,
+                              'player_dismissed': str, 'other_wicket_type': str,
+                              'other_player_dismissed': str, 'cricsheet_id': str, 'total_runs': int
+                          })
+    match_df = pd.read_csv('data/cleaned_match_data.csv',
+                           dtype={
+                               'id': str, 'season': str, 'city': str, 'date': str,
+                               'team1': str, 'team2': str, 'toss_winner': str, 'toss_decision': str,
+                               'result': str, 'dl_applied': int, 'winner': str,
+                               'win_by_runs': float, 'win_by_wickets': float, 'player_of_match': str,
+                               'venue': str, 'umpire1': str, 'umpire2': str, 'umpire3': str
+                           })
+    # Convert date columns to datetime
+    match_df['date'] = pd.to_datetime(match_df['date'], errors='coerce')
+    ball_df['start_date'] = pd.to_datetime(ball_df['start_date'], errors='coerce')
+    # Filter for ODI matches
+    odi_date_mask = (match_df['date'].dt.year >= 2015) & (match_df['date'].dt.year <= 2022)
+    match_df = match_df[odi_date_mask].copy()
+    # Compute team total scores
+    team_scores = ball_df.groupby(['match_id', 'batting_team'])['total_runs'].sum().reset_index()
+    team_scores.rename(columns={'total_runs': 'team_total'}, inplace=True)
+    # Merge scores into match_df
+    match_df = match_df.merge(team_scores, left_on=['id', 'team1'], right_on=['match_id', 'batting_team'], how='left')
+    match_df.rename(columns={'team_total': 'team1_total'}, inplace=True)
+    match_df['team1_total'] = match_df['team1_total'].fillna(match_df['team1_total'].mean())
+    match_df = match_df.merge(team_scores, left_on=['id', 'team2'], right_on=['match_id', 'batting_team'], how='left')
+    match_df.rename(columns={'team_total': 'team2_total'}, inplace=True)
+    match_df['team2_total'] = match_df['team2_total'].fillna(match_df['team2_total'].mean())
+    match_df.drop(columns=['batting_team', 'match_id'], errors='ignore', inplace=True)
+    # Add venue and city indices
+    match_df['venue_index'] = match_df['venue'].astype('category').cat.codes
+    match_df['city_index'] = match_df['city'].astype('category').cat.codes
+    # Add toss features
+    match_df['toss_winner_index'] = match_df['toss_winner'].astype('category').cat.codes
+    match_df['toss_decision_index'] = match_df['toss_decision'].map({'bat': 1, 'field': 0}).fillna(0).astype(int)
+    # Compute historical win rates
+    match_df['date_numeric'] = (match_df['date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1d')
+    max_date = match_df['date_numeric'].max()
+    team1_wins = match_df[match_df['winner'] == match_df['team1']].groupby('team1').agg({'date_numeric': 'mean', 'id': 'count'}).reset_index()
+    team1_wins.rename(columns={'id': 'wins', 'date_numeric': 'win_date', 'team1': 'team'}, inplace=True)
+    team2_wins = match_df[match_df['winner'] == match_df['team2']].groupby('team2').agg({'date_numeric': 'mean', 'id': 'count'}).reset_index()
+    team2_wins.rename(columns={'id': 'wins', 'date_numeric': 'win_date', 'team2': 'team'}, inplace=True)
+    team_wins = pd.concat([team1_wins, team2_wins]).groupby('team').agg({'wins': 'sum', 'win_date': 'mean'}).reset_index()
+    team1_matches = match_df.groupby('team1').size().reset_index(name='matches')
+    team1_matches.rename(columns={'team1': 'team'}, inplace=True)
+    team2_matches = match_df.groupby('team2').size().reset_index(name='matches')
+    team2_matches.rename(columns={'team2': 'team'}, inplace=True)
+    team_matches = pd.concat([team1_matches, team2_matches]).groupby('team')['matches'].sum().reset_index()
+    team_win_rates = team_matches.merge(team_wins, on='team', how='left').fillna(0)
+    team_win_rates['weighted_wins'] = team_win_rates.apply(lambda x: x['wins'] * np.exp(-0.1 * (max_date - x['win_date']) / 365) if pd.notna(x['win_date']) else 0, axis=1)
+    team_win_rates['win_rate'] = team_win_rates['weighted_wins'] / team_win_rates['matches']
+    team_win_rates['win_rate'] = team_win_rates['win_rate'].fillna(0)
+    match_df = match_df.merge(team_win_rates[['team', 'win_rate']].rename(columns={'team': 'team1', 'win_rate': 'team1_win_rate'}), on='team1', how='left')
+    match_df = match_df.merge(team_win_rates[['team', 'win_rate']].rename(columns={'team': 'team2', 'win_rate': 'team2_win_rate'}), on='team2', how='left')
+    # Compute head-to-head win rates
+    head_to_head = match_df[match_df['team1'].isin(match_df['team1'].unique()) & match_df['team2'].isin(match_df['team2'].unique())]
+    head_to_head_wins = head_to_head[head_to_head['winner'] == head_to_head['team1']].groupby(['team1', 'team2']).size().reset_index(name='h2h_wins')
+    head_to_head_matches = head_to_head.groupby(['team1', 'team2']).size().reset_index(name='h2h_matches')
+    h2h_win_rates = head_to_head_matches.merge(head_to_head_wins, on=['team1', 'team2'], how='left').fillna(0)
+    h2h_win_rates = h2h_win_rates[head_to_head_matches['h2h_matches'] >= 1]
+    h2h_win_rates['h2h_win_rate'] = h2h_win_rates['h2h_wins'] / h2h_win_rates['h2h_matches']
+    match_df = match_df.merge(h2h_win_rates[['team1', 'team2', 'h2h_win_rate']], on=['team1', 'team2'], how='left').fillna(0)
+    # Cap outliers
+    match_df['team1_total'] = match_df['team1_total'].clip(upper=500)
+    match_df['team2_total'] = match_df['team2_total'].clip(upper=500)
+    return match_df, ball_df
+# Train team performance model and return it
+def train_team_performance_model(match_df):
+    data = match_df[['team1', 'team2', 'winner', 'team1_total', 'team2_total', 'venue_index', 'city_index',
+                     'toss_winner_index', 'toss_decision_index', 'dl_applied', 'team1_win_rate',
+                     'team2_win_rate', 'h2h_win_rate']].dropna()
+    # Convert categorical teams to numerical indices
+    data['team1_index'] = data['team1'].astype('category').cat.codes
+    data['team2_index'] = data['team2'].astype('category').cat.codes
+    data['winner_index'] = (data['winner'] == data['team1']).astype(int)
+    # Features and targets
+    X = pd.DataFrame()
+    X['team1_index'] = data['team1_index']
+    X['team2_index'] = data['team2_index']
+    X['venue_index'] = data['venue_index']
+    X['city_index'] = data['city_index']
+    X['toss_winner_index'] = data['toss_winner_index']
+    X['toss_decision_index'] = data['toss_decision_index']
+    X['dl_applied'] = data['dl_applied']
+    X['team1_win_rate'] = data['team1_win_rate']
+    X['team2_win_rate'] = data['team2_win_rate']
+    X['h2h_win_rate'] = data['h2h_win_rate'] * 2
+    y_win = data['winner_index']
+    y_score = data[['team1_total', 'team2_total']]
+    # Scale features
+    scaler = StandardScaler()
+    scaled_features = scaler.fit_transform(X[['venue_index', 'city_index', 'toss_winner_index', 'toss_decision_index',
+                                             'dl_applied', 'team1_win_rate', 'team2_win_rate', 'h2h_win_rate']])
+    X_scaled = pd.DataFrame(scaled_features, columns=['venue_index', 'city_index', 'toss_winner_index', 'toss_decision_index',
+                                                     'dl_applied', 'team1_win_rate', 'team2_win_rate', 'h2h_win_rate'])
+    X_scaled['team1_index'] = X['team1_index']
+    X_scaled['team2_index'] = X['team2_index']
+    # Train/test split
+    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_win, test_size=0.2, random_state=42)
+    # Train win model
+    win_model = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, class_weight='balanced')
+    win_model.fit(X_train, y_train)
+    # Train score model
+    base_score_model = HistGradientBoostingRegressor(random_state=42, learning_rate=0.1, max_iter=100)
+    score_model = MultiOutputRegressor(base_score_model)
+    score_model.fit(X_scaled, y_score)
+    return win_model, score_model, data, scaler
+# Train player score model and return it
+def train_player_score_model(match_df, ball_df):
+    player_runs = ball_df.groupby(['match_id', 'striker', 'batting_team'])['runs_off_bat'].sum().reset_index()
     player_runs.rename(columns={'runs_off_bat': 'player_total'}, inplace=True)
     player_data = player_runs.merge(match_df, left_on='match_id', right_on='id', how='left')
+    # Feature engineering
     player_data['player_avg'] = player_data.groupby('striker')['player_total'].transform('mean')
+    player_data['team_win_rate'] = player_data.apply(lambda x: player_data[player_data['team1'] == x['batting_team']]['team1_win_rate'].mean()
+                                                    if x['batting_team'] == x['team1'] else player_data[player_data['team2'] == x['batting_team']]['team2_win_rate'].mean(), axis=1)
     player_data['venue_index'] = player_data['venue'].astype('category').cat.codes
     player_data['city_index'] = player_data['city'].astype('category').cat.codes
     player_data['toss_winner_index'] = player_data['toss_winner'].astype('category').cat.codes
     player_data['toss_decision_index'] = player_data['toss_decision'].map({'bat': 1, 'field': 0}).fillna(0).astype(int)
+    # Features and target
+    X = player_data[['player_avg', 'team_win_rate', 'venue_index', 'city_index', 'toss_winner_index', 'toss_decision_index']].dropna()
     y = player_data.loc[X.index, 'player_total']
     # Scale features
     scaler = StandardScaler()
     X_scaled = scaler.fit_transform(X)
+    # Train model
+    score_model = HistGradientBoostingRegressor(random_state=42, learning_rate=0.1, max_iter=100)
+    score_model.fit(X_scaled, y)
+    return score_model, scaler, player_data
+# Prediction functions (unchanged except removing joblib.load)
+def predict_player_score(player, team, opponent, venue=None, city=None, toss_winner=None, toss_decision=None,
+                        score_model=None, scaler=None, player_data=None):
+    try:
+        if player not in player_data['striker'].values or team not in player_data['batting_team'].values:
+            raise ValueError("Player or team not found in training data")
+        player_avg = player_data[player_data['striker'] == player]['player_total'].mean()
+        team_win_rate = player_data[player_data['batting_team'] == team]['team_win_rate'].mean()
+        venue_index = player_data[player_data['venue'] == venue]['venue_index'].values[0] if venue else player_data['venue_index'].mean()
+        city_index = player_data[player_data['city'] == city]['city_index'].values[0] if city else player_data['city_index'].mean()
+        toss_winner_index = player_data[player_data['toss_winner'] == toss_winner]['toss_winner_index'].values[0] if toss_winner else player_data['toss_winner_index'].mean()
+        toss_decision_index = 1 if toss_decision == 'bat' else 0 if toss_decision == 'field' else player_data['toss_decision_index'].mean()
+        features = scaler.transform([[player_avg, team_win_rate, venue_index, city_index, toss_winner_index, toss_decision_index]])
+        predicted_score = score_model.predict(features)[0]
+        return {
+            "player": player,
+            "team": team,
+            "opponent": opponent,
+            "expected_score": round(predicted_score, 2)
+        }
+    except Exception as e:
+        print(f"Prediction error: {str(e)}")
+        return {
+            "player": player,
+            "team": team,
+            "opponent": opponent,
+            "expected_score": 0.0
+        }
+def predict_team_performance(team1, team2, venue=None, city=None, toss_winner=None, toss_decision=None,
+                             win_model=None, score_model=None, data=None, scaler=None):
+    try:
+        if team1 not in data['team1'].values or team2 not in data['team2'].values:
+            raise ValueError("Team not found in training data")
+        team1_index = data[data['team1'] == team1]['team1_index'].values[0]
+        team2_index = data[data['team2'] == team2]['team2_index'].values[0]
+        venue_index = data[data['venue'] == venue]['venue_index'].values[0] if venue else data['venue_index'].mean()
+        city_index = data[data['city'] == city]['city_index'].values[0] if city else data['city_index'].mean()
+        toss_winner_index = data[data['toss_winner'] == toss_winner]['toss_winner_index'].values[0] if toss_winner else data['toss_winner_index'].mean()
+        toss_decision_index = 1 if toss_decision == 'bat' else 0 if toss_decision == 'field' else data['toss_decision_index'].mean()
+        dl_applied = 0 if pd.isna(toss_decision) else data['dl_applied'].mean()
+        team1_win_rate = data[data['team1'] == team1]['team1_win_rate'].values[0]
+        team2_win_rate = data[data['team2'] == team2]['team2_win_rate'].values[0]
+        h2h_win_rate = data[(data['team1'] == team1) & (data['team2'] == team2)]['h2h_win_rate'].values[0] if not data[(data['team1'] == team1) & (data['team2'] == team2)].empty else 0
+        features = scaler.transform([[venue_index, city_index, toss_winner_index, toss_decision_index, dl_applied,
+                                     team1_win_rate, team2_win_rate, h2h_win_rate]])
+        win_probability = win_model.predict_proba([[team1_index, team2_index, features[0][0], features[0][1],
+                                                   features[0][2], features[0][3], features[0][4], features[0][5],
+                                                   features[0][6], features[0][7]]])[:, 1][0] * 100
+        predicted_scores = score_model.predict([[team1_index, team2_index, features[0][0], features[0][1],
+                                                features[0][2], features[0][3], features[0][4], features[0][5],
+                                                features[0][6], features[0][7]]])[0]
+        return {
+            "team1": team1,
+            "team2": team2,
+            "win_probability_team1": round(win_probability, 2),
+            "expected_team1_score": round(predicted_scores[0], 2),
+            "expected_team2_score": round(predicted_scores[1], 2)
+        }
+    except Exception as e:
+        print(f"Prediction error: {str(e)}")
+        return {
+            "team1": team1,
+            "team2": team2,
+            "win_probability_team1": 50.0,
+            "expected_team1_score": 0.0,
+            "expected_team2_score": 0.0
+        }

services.py CHANGED Viewed

@@ -1,37 +1,80 @@
-import os
 import pandas as pd
 import numpy as np
 from fastapi import HTTPException
-from models.train_model import train_player_score_model, train_team_performance_model  # No .pkl files needed!
-from dotenv import load_dotenv
 from groq import Groq
-# Load environment variables for security
-load_dotenv()
-# 🔹 Secure API Key Storage (Avoid Hardcoding API Keys)
-GROQ_API_KEY = os.getenv("GROQ_API_KEY")
-if not GROQ_API_KEY:
-    raise ValueError("Missing GROQ API key. Set it in environment variables.")
 client = Groq(api_key=GROQ_API_KEY)
-# 🔹 Load datasets
-match_df = pd.read_csv('data/cleaned_match_data.csv')
-ball_df = pd.read_csv('data/cleaned_ball_data.csv', low_memory=False)
-# 🔹 Train models dynamically (No `.pkl` files!)
-player_score_model, player_scaler = train_player_score_model()
-team_win_model, team_score_model = train_team_performance_model()
-# 🔹 LLM Summary Generation (Groq AI)
 def generate_summary(data, context_type):
-    prompt = f"Summarize this {context_type} data in one sentence: {data}"
     try:
         chat_completion = client.chat.completions.create(
             model="mixtral-8x7b-32768",
-            messages=[{"role": "system", "content": "You are a concise cricket analyst."},
-                      {"role": "user", "content": prompt}],
             max_tokens=50,
             temperature=0.7
         )
@@ -39,61 +82,190 @@ def generate_summary(data, context_type):
     except Exception as e:
         return f"Summary unavailable due to error: {str(e)}"
-# 🔹 Predict Player Score (No `.pkl` file needed)
 def predict_score(player_name: str, opposition_team: str):
     try:
-        input_features = np.array([[50, 1, 2, 3, 1]])  # Example feature vector
-        input_features = player_scaler.transform(input_features)
-        predicted_runs = player_score_model.predict(input_features)[0]
         stats = {
             "player": player_name,
             "opposition": opposition_team,
-            "predicted_runs": round(predicted_runs, 2),
-            "summary": generate_summary(predicted_runs, "prediction_score")
         }
-        return stats
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error predicting score: {str(e)}")
-# 🔹 Predict Team Outcome (No `.pkl` file needed)
 def predict_team_outcome(team1: str, team2: str):
-    try:
-        input_features = np.array([[1, 2]])  # Example feature vector
-        win_probability = team_win_model.predict_proba(input_features)[:, 1][0] * 100
-        predicted_scores = team_score_model.predict(input_features)[0]
-        return {
-            "team1": team1,
-            "team2": team2,
-            "win_probability_team1": round(win_probability, 2),
-            "expected_team1_score": round(predicted_scores[0], 2),
-            "expected_team2_score": round(predicted_scores[1], 2),
-            "summary": generate_summary(win_probability, "prediction_team")
-        }
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error predicting team outcome: {str(e)}")
-# 🔹 Utility Functions
 def get_teams():
-    return {"teams": sorted(set(match_df['team1'].unique().tolist() + match_df['team2'].unique().tolist()))}
 def get_players():
-    return {"players": sorted(set(ball_df['striker'].dropna().unique().tolist()))}
 def get_seasons():
-    return {"seasons": ["All Seasons"] + sorted(match_df['season'].dropna().unique().tolist())}
-# 🔹 Get Team Trends Over Time
 def get_team_trends(team_name: str):
     team_name = team_name.strip().title()
-    team_matches = match_df[(match_df['team1'] == team_name) | (match_df['team2'] == team_name)]
     if team_matches.empty:
         raise HTTPException(status_code=404, detail="Team not found")
-    # Calculate win percentage and other metrics for each season
     trends = []
-    for season in match_df['season'].unique():
         season_matches = team_matches[team_matches['season'] == season]
         if not season_matches.empty:
             wins = season_matches[season_matches['winner'] == team_name].shape[0]
@@ -103,22 +275,21 @@ def get_team_trends(team_name: str):
                 "season": season,
                 "wins": wins,
                 "total_matches": total_matches,
-                "win_percentage": round(win_percentage, 2)
             })
-    return {"team_name": team_name, "trends": trends, "summary": generate_summary(trends, "team_trends")}
-# 🔹 Get Player Trends Over Time
 def get_player_trends(player_name: str, role: str = "Batting"):
     player_name = player_name.strip().title()
     name_variations = [player_name, player_name.replace(" ", ""), " ".join(reversed(player_name.split()))]
-    player_data = ball_df[ball_df['striker'].isin(name_variations) | ball_df['bowler'].isin(name_variations)]
     if player_data.empty:
         raise HTTPException(status_code=404, detail=f"Player '{player_name}' not found")
-    # Calculate performance metrics for each season
     trends = []
-    for season in ball_df['season'].unique():
         season_data = player_data[player_data['season'] == season]
         if not season_data.empty:
             if role == "Batting":
@@ -129,7 +300,7 @@ def get_player_trends(player_name: str, role: str = "Batting"):
                 trends.append({
                     "season": season,
                     "total_runs": total_runs,
-                    "strike_rate": round(strike_rate, 2),
                     "matches_played": matches_played
                 })
             elif role == "Bowling":
@@ -142,9 +313,9 @@ def get_player_trends(player_name: str, role: str = "Batting"):
                 trends.append({
                     "season": season,
                     "total_wickets": total_wickets,
-                    "bowling_average": round(bowling_average, 2),
-                    "economy_rate": round(economy_rate, 2),
                     "matches_played": matches_played
                 })
-    return {"player_name": player_name, "role": role, "trends": trends, "summary": generate_summary(trends, "player_trends")}

+import os
 import numpy as np
 import pandas as pd
 from fastapi import HTTPException
 from groq import Groq
+from models.train_model import (
+    load_and_preprocess_data, train_team_performance_model, train_player_score_model,
+    predict_player_score, predict_team_performance
+)
+# Module-level state filled in by initialize_models(); every entry is None
+# until the data has been loaded and the models have been trained.
+TEAM_WIN_MODEL = None
+TEAM_SCORE_MODEL = None
+TEAM_DATA = None
+TEAM_SCALER = None
+PLAYER_SCORE_MODEL = None
+PLAYER_SCALER = None
+PLAYER_DATA = None
+MATCH_DF = None
+BALL_DF = None
+# Initialize Groq client.
+# The API key is read from the environment (e.g. a deployment secret) instead
+# of being committed to the repository. A key that has been pushed to source
+# control must be treated as leaked and revoked.
+# NOTE(review): with no key available the Groq SDK is expected to raise when
+# the client is constructed - confirm against the SDK documentation.
+GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
 client = Groq(api_key=GROQ_API_KEY)
+# Load data and train models (idempotent: safe to call more than once)
+_MODELS_INITIALIZED = False
+def initialize_models():
+    """Load the datasets and train the prediction models once per process.
+
+    Fills the module-level MATCH_DF / BALL_DF dataframes and the team and
+    player model, scaler and data globals. Calls made after the first
+    successful one return immediately, so the import-time call below and the
+    FastAPI startup hook in main.py do not train the models twice.
+    """
+    global TEAM_WIN_MODEL, TEAM_SCORE_MODEL, TEAM_DATA, TEAM_SCALER
+    global PLAYER_SCORE_MODEL, PLAYER_SCALER, PLAYER_DATA, MATCH_DF, BALL_DF
+    global _MODELS_INITIALIZED
+    if _MODELS_INITIALIZED:
+        return
+    MATCH_DF, BALL_DF = load_and_preprocess_data()
+    TEAM_WIN_MODEL, TEAM_SCORE_MODEL, TEAM_DATA, TEAM_SCALER = train_team_performance_model(MATCH_DF)
+    PLAYER_SCORE_MODEL, PLAYER_SCALER, PLAYER_DATA = train_player_score_model(MATCH_DF, BALL_DF)
+    # Flag is set only after everything succeeded so a failed attempt can be retried.
+    _MODELS_INITIALIZED = True
+    print("Models trained and loaded into memory.")
+# Runs at import time because module-level code further down reads BALL_DF.
+# main.py's startup event calls initialize_models() again; that call is a no-op.
+initialize_models()
+# Player-team mapping: each striker -> the batting team recorded most often for them
+def _most_common_team(teams):
+    """Return the first modal value of *teams*, or None when there is no mode."""
+    modes = teams.mode()
+    return modes[0] if len(modes) > 0 else None
+player_team_mapping = BALL_DF.groupby('striker')['batting_team'].agg(_most_common_team).to_dict()
+# Clean JSON data
+def clean_json(data):
+    """Recursively convert *data* into JSON-serialisable built-in values.
+
+    - dicts and lists are cleaned element by element;
+    - floats (Python or NumPy) that are NaN or infinite become 0.0;
+    - NumPy integers and booleans become plain int / bool;
+    - other missing values (None, NaT, ...) become None;
+    - pandas Timestamps become 'YYYY-MM-DD' strings;
+    - int and bool pass through unchanged;
+    - anything else is returned as str(data).
+    """
+    if isinstance(data, dict):
+        return {k: clean_json(v) for k, v in data.items()}
+    if isinstance(data, list):
+        return [clean_json(v) for v in data]
+    # NumPy scalars are not instances of int/bool (and float32 is not a
+    # float), so without these branches they would be stringified below.
+    if isinstance(data, np.bool_):
+        return bool(data)
+    if isinstance(data, np.integer):
+        return int(data)
+    if isinstance(data, (float, np.floating)):
+        value = float(data)
+        return 0.0 if pd.isna(value) or np.isinf(value) else value
+    if pd.isna(data):
+        return None
+    if isinstance(data, pd.Timestamp):
+        # NaT has already been mapped to None by the pd.isna() check above.
+        return data.strftime('%Y-%m-%d')
+    if isinstance(data, (int, bool)):
+        return data
+    return str(data)
+# Summary generation (unchanged)
 def generate_summary(data, context_type):
+    """Ask the Groq chat model for a one-sentence summary of *data*.
+
+    context_type selects the prompt wording. Recognised values are
+    "player_stats", "team_stats", "match_history", "prediction_score" and
+    "prediction_team"; any other value leaves the prompt empty. For
+    "match_history", *data* must provide 'team1', 'team2' and 'matches'.
+
+    If the API call raises, a "Summary unavailable ..." string is returned
+    instead of propagating the error.
+    NOTE(review): the success-path return statement is not visible in this
+    diff excerpt - confirm against the full file.
+    """
+    prompt = ""
+    if context_type == "player_stats":
+        prompt = f"Summarize this player data in one sentence: {data}"
+    elif context_type == "team_stats":
+        prompt = f"Summarize this team data in one sentence: {data}"
+    elif context_type == "match_history":
+        prompt = f"Summarize this match history between {data['team1']} and {data['team2']} in one sentence: {data['matches']}"
+    elif context_type == "prediction_score":
+        prompt = f"Summarize this prediction in one sentence: {data}"
+    elif context_type == "prediction_team":
+        prompt = f"Summarize this team prediction in one sentence: {data}"
     try:
+        # Short, single-sentence completion: capped at 50 tokens.
         chat_completion = client.chat.completions.create(
             model="mixtral-8x7b-32768",
+            messages=[
+                {"role": "system", "content": "You are a concise cricket analyst."},
+                {"role": "user", "content": prompt}
+            ],
             max_tokens=50,
             temperature=0.7
         )
+    # Summaries are best-effort: any failure is reported in the returned text.
     except Exception as e:
         return f"Summary unavailable due to error: {str(e)}"
+# Player stats (uses the global BALL_DF)
+def get_player_stats(player_name: str, season: str = None, role: str = "Batting"):
+    """Return aggregate batting or bowling statistics for one player.
+
+    Args:
+        player_name: Player to look up; matched title-cased, with spaces
+            removed, and with the name parts reversed.
+        season: Optional season filter; all seasons when falsy.
+        role: "Batting" or "Bowling".
+
+    Returns:
+        A JSON-safe dict of statistics including a generated "summary".
+
+    Raises:
+        HTTPException: 404 when no deliveries match the player (and season),
+            400 when *role* is neither "Batting" nor "Bowling".
+    """
+    player_name = player_name.strip().title()
+    name_variations = [player_name, player_name.replace(" ", ""), " ".join(reversed(player_name.split()))]
+    player_data = BALL_DF[BALL_DF['striker'].isin(name_variations) | BALL_DF['bowler'].isin(name_variations)]
+    if season and 'season' in BALL_DF.columns:
+        player_data = player_data[player_data['season'] == season]
+    if player_data.empty:
+        raise HTTPException(status_code=404, detail=f"Player '{player_name}' not found. Variations tried: {name_variations}")
+    season_label = season if season else "All Seasons"
+    if role == "Batting":
+        batting_data = player_data[player_data['striker'].isin(name_variations)]
+        total_runs = int(batting_data['runs_off_bat'].sum())
+        # One row per delivery faced.
+        balls_faced = int(batting_data.shape[0])
+        strike_rate = float((total_runs / balls_faced * 100) if balls_faced > 0 else 0)
+        matches_played = int(len(batting_data['match_id'].unique()))
+        stats = {
+            "player_name": player_name,
+            "role": role,
+            "total_runs": total_runs,
+            "balls_faced": balls_faced,
+            "strike_rate": strike_rate,
+            "matches_played": matches_played,
+            "season": season_label
+        }
+    elif role == "Bowling":
+        bowling_data = player_data[player_data['bowler'].isin(name_variations)]
+        # Only these dismissal types are counted as the bowler's wickets.
+        bowler_wicket_types = ["caught", "bowled", "lbw", "caught and bowled", "hit wicket"]
+        wickets_data = bowling_data[bowling_data['player_dismissed'].notna() &
+                                   bowling_data['wicket_type'].isin(bowler_wicket_types)]
+        total_wickets = int(wickets_data.shape[0])
+        total_runs_conceded = int(bowling_data['total_runs'].sum())
+        total_balls_bowled = int(bowling_data.shape[0])
+        total_overs_bowled = float(total_balls_bowled / 6)
+        economy_rate = float(total_runs_conceded / total_overs_bowled) if total_overs_bowled > 0 else 0
+        # Average and strike rate are undefined without wickets; report 0.0.
+        if total_wickets > 0:
+            bowling_average = round(float(total_runs_conceded / total_wickets), 2)
+            bowling_strike_rate = round(float(total_balls_bowled / total_wickets), 2)
+        else:
+            bowling_average = 0.0
+            bowling_strike_rate = 0.0
+        bowling_matches = int(len(bowling_data['match_id'].unique()))
+        stats = {
+            "player_name": player_name,
+            "role": role,
+            "total_wickets": total_wickets,
+            "bowling_average": bowling_average,
+            "economy_rate": round(economy_rate, 2),
+            "bowling_strike_rate": bowling_strike_rate,
+            "overs_bowled": round(total_overs_bowled, 1),
+            "bowling_matches": bowling_matches,
+            "season": season_label
+        }
+    else:
+        # Previously an unknown role fell through and returned None.
+        raise HTTPException(status_code=400, detail=f"Unknown role '{role}'. Expected 'Batting' or 'Bowling'.")
+    stats["summary"] = generate_summary(stats, "player_stats")
+    return clean_json(stats)
+# Team stats (uses the global MATCH_DF)
+def get_team_stats(team_name: str, season: str = None):
+    """Return win/loss totals for a team, optionally limited to one season.
+
+    Args:
+        team_name: Team to look up (stripped and title-cased before matching).
+        season: Optional season filter; all seasons when falsy.
+
+    Returns:
+        A JSON-safe dict with total_matches, wins, losses, no_result,
+        win_percentage, season and a generated "summary".
+
+    Raises:
+        HTTPException: 404 when no matches are found for the team (and season).
+    """
+    team_name = team_name.strip().title()
+    team_matches = MATCH_DF[(MATCH_DF['team1'] == team_name) | (MATCH_DF['team2'] == team_name)]
+    if season and 'season' in MATCH_DF.columns:
+        team_matches = team_matches[team_matches['season'] == season]
+    if team_matches.empty:
+        raise HTTPException(status_code=404, detail="Team not found")
+    total_matches = int(team_matches.shape[0])
+    # A match without a recorded winner is neither a win nor a loss, so only
+    # matches with a winner are split into wins and losses.
+    decided_winners = team_matches['winner'].dropna()
+    wins = int((decided_winners == team_name).sum())
+    losses = int((decided_winners != team_name).sum())
+    stats = {
+        "total_matches": total_matches,
+        "wins": wins,
+        "losses": losses,
+        "no_result": total_matches - wins - losses,
+        "win_percentage": float((wins / total_matches * 100) if total_matches > 0 else 0),
+        "season": season if season else "All Seasons"
+    }
+    stats["summary"] = generate_summary(stats, "team_stats")
+    return clean_json(stats)
+# Match history (uses the global MATCH_DF)
+def get_match_history(team1: str, team2: str, season: str = None):
+    """Return the head-to-head match list for two teams.
+
+    Team names are stripped and title-cased. Raises HTTPException 404 when a
+    team is unknown or when the two teams have no matches (in the given
+    season). Matches without a winner are reported with winner "Draw".
+    """
+    team1 = team1.strip().title()
+    team2 = team2.strip().title()
+    known_teams = set(MATCH_DF['team1'].unique().tolist() + MATCH_DF['team2'].unique().tolist())
+    # Report the first unknown team, checking team1 before team2.
+    for candidate in (team1, team2):
+        if candidate not in known_teams:
+            raise HTTPException(status_code=404, detail=f"Team {candidate} not found.")
+    as_listed = (MATCH_DF['team1'] == team1) & (MATCH_DF['team2'] == team2)
+    swapped = (MATCH_DF['team1'] == team2) & (MATCH_DF['team2'] == team1)
+    head_to_head = MATCH_DF[as_listed | swapped].copy()
+    if season and 'season' in MATCH_DF.columns:
+        head_to_head = head_to_head[head_to_head['season'] == season]
+    if head_to_head.empty:
+        raise HTTPException(status_code=404, detail=f"No match history found between {team1} and {team2}.")
+    head_to_head['date'] = head_to_head['date'].apply(lambda d: d.strftime('%Y-%m-%d') if pd.notna(d) else None)
+    head_to_head['winner'] = head_to_head['winner'].fillna("Draw")
+    for name_column in ('team1', 'team2', 'winner'):
+        head_to_head[name_column] = head_to_head[name_column].apply(lambda v: str(v) if pd.notna(v) else None)
+    response = {
+        "team1": team1,
+        "team2": team2,
+        "season": season if season else "All Seasons",
+        "matches": head_to_head[['date', 'team1', 'team2', 'winner']].to_dict(orient='records')
+    }
+    response["summary"] = generate_summary(response, "match_history")
+    return clean_json(response)
+# Prediction functions using in-memory models
 def predict_score(player_name: str, opposition_team: str):
+    """Predict a player's score against an opposition team.
+
+    Args:
+        player_name: Player to look up; '+' is treated as a space, and the
+            name is also tried without spaces and with its parts reversed.
+        opposition_team: Opponent name, passed to the model unchanged.
+
+    Returns:
+        A JSON-safe dict with player, team, opposition, predicted_runs and a
+        generated "summary".
+
+    Raises:
+        HTTPException: 404 when the player has no team in player_team_mapping,
+            500 when the prediction itself fails.
+    """
+    player_name = player_name.strip().replace("+", " ").title()
+    name_variations = [player_name, player_name.replace(" ", ""), " ".join(reversed(player_name.split()))]
+    player_team = None
+    for name in name_variations:
+        if name in player_team_mapping:
+            player_team = player_team_mapping[name]
+            player_name = name
+            break
+    if not player_team:
+        # An unknown player is a client error, not a server failure.
+        raise HTTPException(status_code=404, detail=f"Player {player_name} not found in historical data")
     try:
+        predicted_runs = predict_player_score(
+            player=player_name,
+            team=player_team,
+            opponent=opposition_team,
+            venue=None,
+            city=None,
+            toss_winner=None,
+            toss_decision=None,
+            score_model=PLAYER_SCORE_MODEL,
+            scaler=PLAYER_SCALER,
+            player_data=PLAYER_DATA
+        )
         stats = {
             "player": player_name,
+            "team": player_team,
             "opposition": opposition_team,
+            "predicted_runs": predicted_runs["expected_score"]
         }
+        stats["summary"] = generate_summary(stats, "prediction_score")
+        return clean_json(stats)
+    except HTTPException:
+        # Keep the status code of errors that are already HTTP errors.
+        raise
     except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error predicting score for {player_name} against {opposition_team}: {str(e)}") from e
 def predict_team_outcome(team1: str, team2: str):
+    """Predict the outcome of team1 vs team2 with the in-memory team models.
+
+    Returns the prediction produced by predict_team_performance with a
+    generated "summary" added, passed through clean_json.
+    """
+    # Venue, city and toss details are not supplied by this entry point.
+    match_context = dict.fromkeys(("venue", "city", "toss_winner", "toss_decision"))
+    trained_state = {
+        "win_model": TEAM_WIN_MODEL,
+        "score_model": TEAM_SCORE_MODEL,
+        "data": TEAM_DATA,
+        "scaler": TEAM_SCALER,
+    }
+    prediction = predict_team_performance(team1=team1, team2=team2, **match_context, **trained_state)
+    prediction["summary"] = generate_summary(prediction, "prediction_team")
+    return clean_json(prediction)
+# Utility functions (read the global dataframes)
 def get_teams():
+    """Return every team name found in MATCH_DF's team1/team2 columns, sorted."""
+    team_names = set(MATCH_DF['team1'].unique().tolist())
+    team_names.update(MATCH_DF['team2'].unique().tolist())
+    return clean_json({"teams": sorted(team_names)})
 def get_players():
+    """Return the sorted list of distinct, non-null striker names in BALL_DF."""
+    strikers = BALL_DF['striker'].dropna()
+    return clean_json({"players": sorted(set(strikers.unique().tolist()))})
 def get_seasons():
+    """Return "All Seasons" followed by the sorted non-null seasons in MATCH_DF."""
+    seasons = MATCH_DF['season'].dropna().unique().tolist()
+    seasons.sort()
+    return clean_json({"seasons": ["All Seasons", *seasons]})
+# Team trends (unchanged except using global MATCH_DF)
 def get_team_trends(team_name: str):
+    """Return per-season results for one team.
+
+    The team name is stripped and title-cased, then matched against the
+    team1/team2 columns of MATCH_DF. Each season in which the team has
+    matches contributes one entry with season, wins, total_matches and
+    win_percentage.
+
+    Raises:
+        HTTPException: 404 when the team has no matches at all.
+    """
     team_name = team_name.strip().title()
+    team_matches = MATCH_DF[(MATCH_DF['team1'] == team_name) | (MATCH_DF['team2'] == team_name)]
     if team_matches.empty:
         raise HTTPException(status_code=404, detail="Team not found")
     trends = []
+    # Walk every season in the dataset; seasons without matches for the team are skipped.
+    for season in MATCH_DF['season'].unique():
         season_matches = team_matches[team_matches['season'] == season]
         if not season_matches.empty:
             wins = season_matches[season_matches['winner'] == team_name].shape[0]
+            # NOTE(review): the lines computing total_matches / win_percentage and
+            # opening the trends.append({ call are elided from this diff excerpt.
                 "season": season,
                 "wins": wins,
                 "total_matches": total_matches,
+                "win_percentage": win_percentage
             })
+    return {"team_name": team_name, "trends": trends}
+# Player trends (uses the global BALL_DF)
 def get_player_trends(player_name: str, role: str = "Batting"):
+    """Return per-season batting or bowling figures for one player.
+
+    The player is matched as striker or bowler using the title-cased name,
+    the name without spaces, and the name with its parts reversed. Each
+    season with data contributes one entry whose keys depend on *role*
+    ("Batting" or "Bowling").
+
+    Raises:
+        HTTPException: 404 when no deliveries match the player.
+    """
     player_name = player_name.strip().title()
     name_variations = [player_name, player_name.replace(" ", ""), " ".join(reversed(player_name.split()))]
+    player_data = BALL_DF[BALL_DF['striker'].isin(name_variations) | BALL_DF['bowler'].isin(name_variations)]
     if player_data.empty:
         raise HTTPException(status_code=404, detail=f"Player '{player_name}' not found")
     trends = []
+    for season in BALL_DF['season'].unique():
         season_data = player_data[player_data['season'] == season]
         if not season_data.empty:
             if role == "Batting":
+                # NOTE(review): the batting calculations are elided from this diff excerpt.
                 trends.append({
                     "season": season,
                     "total_runs": total_runs,
+                    "strike_rate": strike_rate,
                     "matches_played": matches_played
                 })
             elif role == "Bowling":
+                # NOTE(review): the bowling calculations are elided from this diff excerpt.
                 trends.append({
                     "season": season,
                     "total_wickets": total_wickets,
+                    "bowling_average": bowling_average,
+                    "economy_rate": economy_rate,
                     "matches_played": matches_played
                 })
+    return {"player_name": player_name, "role": role, "trends": trends}