sivapriya175 committed on
Commit
27e29a2
·
1 Parent(s): 9f8cfb6

deploy backend files

Browse files
__pycache__/routers.cpython-313.pyc CHANGED
Binary files a/__pycache__/routers.cpython-313.pyc and b/__pycache__/routers.cpython-313.pyc differ
 
__pycache__/services.cpython-313.pyc CHANGED
Binary files a/__pycache__/services.cpython-313.pyc and b/__pycache__/services.cpython-313.pyc differ
 
main.py CHANGED
@@ -1,10 +1,15 @@
1
  from fastapi import FastAPI
2
  from routers import router
 
3
 
4
  app = FastAPI(title="Cricket Statistics API")
5
 
6
  app.include_router(router)
7
 
 
 
 
 
8
  @app.get("/")
9
  async def root():
10
- return {"message": "Welcome to the Cricket Statistics API"}
 
1
  from fastapi import FastAPI
2
  from routers import router
3
+ from services import initialize_models
4
 
5
  app = FastAPI(title="Cricket Statistics API")
6
 
7
  app.include_router(router)
8
 
9
+ @app.on_event("startup")
10
+ async def startup_event():
11
+ initialize_models() # Train models at startup
12
+
13
  @app.get("/")
14
  async def root():
15
+ return {"message": "Welcome to the Cricket Statistics API"}
models/__pycache__/train_model.cpython-313.pyc CHANGED
Binary files a/models/__pycache__/train_model.cpython-313.pyc and b/models/__pycache__/train_model.cpython-313.pyc differ
 
models/train_model.py CHANGED
@@ -6,82 +6,242 @@ from sklearn.model_selection import train_test_split
6
  from sklearn.metrics import mean_squared_error, accuracy_score, r2_score
7
  from sklearn.preprocessing import StandardScaler
8
 
9
- # 🔹 Load datasets
10
- ball_df = pd.read_csv('data/cleaned_ball_data.csv')
11
- match_df = pd.read_csv('data/cleaned_match_data.csv')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- # Convert date columns
14
- match_df['date'] = pd.to_datetime(match_df['date'], errors='coerce')
15
- ball_df['start_date'] = pd.to_datetime(ball_df['start_date'], errors='coerce')
 
 
 
 
 
 
 
 
 
16
 
17
- # 🔹 Compute team total scores and merge correctly
18
- team_scores = ball_df.groupby(['match_id', 'batting_team'])['total_runs'].sum().reset_index()
19
- team_scores.rename(columns={'total_runs': 'team_total'}, inplace=True)
20
 
21
- # Merge team scores with match_df
22
- match_df = match_df.merge(team_scores, left_on=['id', 'team1'], right_on=['match_id', 'batting_team'], how='left')
23
- match_df.rename(columns={'team_total': 'team1_total'}, inplace=True)
 
 
 
 
 
 
 
 
24
 
25
- match_df = match_df.merge(team_scores, left_on=['id', 'team2'], right_on=['match_id', 'batting_team'], how='left')
26
- match_df.rename(columns={'team_total': 'team2_total'}, inplace=True)
 
27
 
28
- # Fill missing values with 0 to avoid KeyError
29
- match_df['team1_total'] = match_df['team1_total'].fillna(0)
30
- match_df['team2_total'] = match_df['team2_total'].fillna(0)
 
31
 
32
- # Drop unnecessary columns
33
- match_df.drop(columns=['batting_team', 'match_id'], errors='ignore', inplace=True)
34
 
35
- # 🔹 Train Player Score Model
36
- def train_player_score_model():
37
- player_runs = ball_df.groupby(['match_id', 'striker'])['runs_off_bat'].sum().reset_index()
38
  player_runs.rename(columns={'runs_off_bat': 'player_total'}, inplace=True)
39
  player_data = player_runs.merge(match_df, left_on='match_id', right_on='id', how='left')
40
 
41
- # Feature Engineering
42
  player_data['player_avg'] = player_data.groupby('striker')['player_total'].transform('mean')
 
 
43
  player_data['venue_index'] = player_data['venue'].astype('category').cat.codes
44
  player_data['city_index'] = player_data['city'].astype('category').cat.codes
45
  player_data['toss_winner_index'] = player_data['toss_winner'].astype('category').cat.codes
46
  player_data['toss_decision_index'] = player_data['toss_decision'].map({'bat': 1, 'field': 0}).fillna(0).astype(int)
47
 
48
- # Features and Target
49
- X = player_data[['player_avg', 'venue_index', 'city_index', 'toss_winner_index', 'toss_decision_index']].dropna()
50
  y = player_data.loc[X.index, 'player_total']
51
 
52
  # Scale features
53
  scaler = StandardScaler()
54
  X_scaled = scaler.fit_transform(X)
55
 
56
- # Train Model
57
- model = HistGradientBoostingRegressor(random_state=42, learning_rate=0.1, max_iter=100)
58
- model.fit(X_scaled, y)
59
-
60
- return model, scaler
61
-
62
- # 🔹 Train Team Performance Model
63
- def train_team_performance_model():
64
- data = match_df[['team1', 'team2', 'winner', 'team1_total', 'team2_total', 'venue', 'city', 'toss_winner', 'toss_decision']].dropna()
65
- data['team1_index'] = data['team1'].astype('category').cat.codes
66
- data['team2_index'] = data['team2'].astype('category').cat.codes
67
- data['winner_index'] = (data['winner'] == data['team1']).astype(int)
68
-
69
- # Features and targets
70
- X = data[['team1_index', 'team2_index']]
71
- y_win = data['winner_index']
72
- y_score = data[['team1_total', 'team2_total']]
73
-
74
- # Train Team Win Prediction Model
75
- win_model = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42)
76
- win_model.fit(X, y_win)
77
-
78
- # Train Score Prediction Model
79
- base_score_model = HistGradientBoostingRegressor(random_state=42, learning_rate=0.1, max_iter=100)
80
- score_model = MultiOutputRegressor(base_score_model)
81
- score_model.fit(X, y_score)
82
-
83
- return win_model, score_model
84
-
85
- # 🔹 Train models dynamically
86
- player_score_model, player_scaler = train_player_score_model()
87
- team_win_model, team_score_model = train_team_performance_model()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  from sklearn.metrics import mean_squared_error, accuracy_score, r2_score
7
  from sklearn.preprocessing import StandardScaler
8
 
9
+ # Load and preprocess data (same as original)
10
+ def load_and_preprocess_data():
11
+ # Load datasets with exact column names
12
+ ball_df = pd.read_csv('data/cleaned_ball_data.csv',
13
+ dtype={
14
+ 'match_id': str, 'season': str, 'start_date': str, 'venue': str,
15
+ 'innings': int, 'ball': float, 'batting_team': str, 'bowling_team': str,
16
+ 'striker': str, 'non_striker': str, 'bowler': str, 'runs_off_bat': int,
17
+ 'extras': int, 'wides': float, 'noballs': float, 'byes': float,
18
+ 'legbyes': float, 'penalty': float, 'wicket_type': str,
19
+ 'player_dismissed': str, 'other_wicket_type': str,
20
+ 'other_player_dismissed': str, 'cricsheet_id': str, 'total_runs': int
21
+ })
22
+ match_df = pd.read_csv('data/cleaned_match_data.csv',
23
+ dtype={
24
+ 'id': str, 'season': str, 'city': str, 'date': str,
25
+ 'team1': str, 'team2': str, 'toss_winner': str, 'toss_decision': str,
26
+ 'result': str, 'dl_applied': int, 'winner': str,
27
+ 'win_by_runs': float, 'win_by_wickets': float, 'player_of_match': str,
28
+ 'venue': str, 'umpire1': str, 'umpire2': str, 'umpire3': str
29
+ })
30
+
31
+ # Convert date columns to datetime
32
+ match_df['date'] = pd.to_datetime(match_df['date'], errors='coerce')
33
+ ball_df['start_date'] = pd.to_datetime(ball_df['start_date'], errors='coerce')
34
+
35
+ # Keep only matches dated 2015-2022 (used here as the ODI window; this filters by year, not by match format)
36
+ odi_date_mask = (match_df['date'].dt.year >= 2015) & (match_df['date'].dt.year <= 2022)
37
+ match_df = match_df[odi_date_mask].copy()
38
+
39
+ # Compute team total scores
40
+ team_scores = ball_df.groupby(['match_id', 'batting_team'])['total_runs'].sum().reset_index()
41
+ team_scores.rename(columns={'total_runs': 'team_total'}, inplace=True)
42
+
43
+ # Merge scores into match_df
44
+ match_df = match_df.merge(team_scores, left_on=['id', 'team1'], right_on=['match_id', 'batting_team'], how='left')
45
+ match_df.rename(columns={'team_total': 'team1_total'}, inplace=True)
46
+ match_df['team1_total'] = match_df['team1_total'].fillna(match_df['team1_total'].mean())
47
+ match_df = match_df.merge(team_scores, left_on=['id', 'team2'], right_on=['match_id', 'batting_team'], how='left')
48
+ match_df.rename(columns={'team_total': 'team2_total'}, inplace=True)
49
+ match_df['team2_total'] = match_df['team2_total'].fillna(match_df['team2_total'].mean())
50
+ match_df.drop(columns=['batting_team', 'match_id'], errors='ignore', inplace=True)
51
+
52
+ # Add venue and city indices
53
+ match_df['venue_index'] = match_df['venue'].astype('category').cat.codes
54
+ match_df['city_index'] = match_df['city'].astype('category').cat.codes
55
+
56
+ # Add toss features
57
+ match_df['toss_winner_index'] = match_df['toss_winner'].astype('category').cat.codes
58
+ match_df['toss_decision_index'] = match_df['toss_decision'].map({'bat': 1, 'field': 0}).fillna(0).astype(int)
59
+
60
+ # Compute recency-weighted historical win rates per team (wins decay exponentially with age of the mean win date)
61
+ match_df['date_numeric'] = (match_df['date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1d')
62
+ max_date = match_df['date_numeric'].max()
63
+ team1_wins = match_df[match_df['winner'] == match_df['team1']].groupby('team1').agg({'date_numeric': 'mean', 'id': 'count'}).reset_index()
64
+ team1_wins.rename(columns={'id': 'wins', 'date_numeric': 'win_date', 'team1': 'team'}, inplace=True)
65
+ team2_wins = match_df[match_df['winner'] == match_df['team2']].groupby('team2').agg({'date_numeric': 'mean', 'id': 'count'}).reset_index()
66
+ team2_wins.rename(columns={'id': 'wins', 'date_numeric': 'win_date', 'team2': 'team'}, inplace=True)
67
+ team_wins = pd.concat([team1_wins, team2_wins]).groupby('team').agg({'wins': 'sum', 'win_date': 'mean'}).reset_index()
68
+ team1_matches = match_df.groupby('team1').size().reset_index(name='matches')
69
+ team1_matches.rename(columns={'team1': 'team'}, inplace=True)
70
+ team2_matches = match_df.groupby('team2').size().reset_index(name='matches')
71
+ team2_matches.rename(columns={'team2': 'team'}, inplace=True)
72
+ team_matches = pd.concat([team1_matches, team2_matches]).groupby('team')['matches'].sum().reset_index()
73
+ team_win_rates = team_matches.merge(team_wins, on='team', how='left').fillna(0)
74
+ team_win_rates['weighted_wins'] = team_win_rates.apply(lambda x: x['wins'] * np.exp(-0.1 * (max_date - x['win_date']) / 365) if pd.notna(x['win_date']) else 0, axis=1)
75
+ team_win_rates['win_rate'] = team_win_rates['weighted_wins'] / team_win_rates['matches']
76
+ team_win_rates['win_rate'] = team_win_rates['win_rate'].fillna(0)
77
+ match_df = match_df.merge(team_win_rates[['team', 'win_rate']].rename(columns={'team': 'team1', 'win_rate': 'team1_win_rate'}), on='team1', how='left')
78
+ match_df = match_df.merge(team_win_rates[['team', 'win_rate']].rename(columns={'team': 'team2', 'win_rate': 'team2_win_rate'}), on='team2', how='left')
79
+
80
+ # Compute head-to-head win rates
81
+ head_to_head = match_df[match_df['team1'].isin(match_df['team1'].unique()) & match_df['team2'].isin(match_df['team2'].unique())]
82
+ head_to_head_wins = head_to_head[head_to_head['winner'] == head_to_head['team1']].groupby(['team1', 'team2']).size().reset_index(name='h2h_wins')
83
+ head_to_head_matches = head_to_head.groupby(['team1', 'team2']).size().reset_index(name='h2h_matches')
84
+ h2h_win_rates = head_to_head_matches.merge(head_to_head_wins, on=['team1', 'team2'], how='left').fillna(0)
85
+ h2h_win_rates = h2h_win_rates[head_to_head_matches['h2h_matches'] >= 1]
86
+ h2h_win_rates['h2h_win_rate'] = h2h_win_rates['h2h_wins'] / h2h_win_rates['h2h_matches']
87
+ match_df = match_df.merge(h2h_win_rates[['team1', 'team2', 'h2h_win_rate']], on=['team1', 'team2'], how='left').fillna(0)
88
+
89
+ # Cap outliers
90
+ match_df['team1_total'] = match_df['team1_total'].clip(upper=500)
91
+ match_df['team2_total'] = match_df['team2_total'].clip(upper=500)
92
+
93
+ return match_df, ball_df
94
+
95
+ # Train team performance model and return it
96
+ def train_team_performance_model(match_df):
97
+ data = match_df[['team1', 'team2', 'winner', 'team1_total', 'team2_total', 'venue_index', 'city_index',
98
+ 'toss_winner_index', 'toss_decision_index', 'dl_applied', 'team1_win_rate',
99
+ 'team2_win_rate', 'h2h_win_rate']].dropna()
100
+
101
+ # Convert categorical teams to numerical indices
102
+ data['team1_index'] = data['team1'].astype('category').cat.codes
103
+ data['team2_index'] = data['team2'].astype('category').cat.codes
104
+ data['winner_index'] = (data['winner'] == data['team1']).astype(int)
105
 
106
+ # Features and targets
107
+ X = pd.DataFrame()
108
+ X['team1_index'] = data['team1_index']
109
+ X['team2_index'] = data['team2_index']
110
+ X['venue_index'] = data['venue_index']
111
+ X['city_index'] = data['city_index']
112
+ X['toss_winner_index'] = data['toss_winner_index']
113
+ X['toss_decision_index'] = data['toss_decision_index']
114
+ X['dl_applied'] = data['dl_applied']
115
+ X['team1_win_rate'] = data['team1_win_rate']
116
+ X['team2_win_rate'] = data['team2_win_rate']
117
+ X['h2h_win_rate'] = data['h2h_win_rate'] * 2
118
 
119
+ y_win = data['winner_index']
120
+ y_score = data[['team1_total', 'team2_total']]
 
121
 
122
+ # Scale features
123
+ scaler = StandardScaler()
124
+ scaled_features = scaler.fit_transform(X[['venue_index', 'city_index', 'toss_winner_index', 'toss_decision_index',
125
+ 'dl_applied', 'team1_win_rate', 'team2_win_rate', 'h2h_win_rate']])
126
+ X_scaled = pd.DataFrame(scaled_features, columns=['venue_index', 'city_index', 'toss_winner_index', 'toss_decision_index',
127
+ 'dl_applied', 'team1_win_rate', 'team2_win_rate', 'h2h_win_rate'])
128
+ X_scaled['team1_index'] = X['team1_index']
129
+ X_scaled['team2_index'] = X['team2_index']
130
+
131
+ # Train/test split
132
+ X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_win, test_size=0.2, random_state=42)
133
 
134
+ # Train win model
135
+ win_model = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, class_weight='balanced')
136
+ win_model.fit(X_train, y_train)
137
 
138
+ # Train score model
139
+ base_score_model = HistGradientBoostingRegressor(random_state=42, learning_rate=0.1, max_iter=100)
140
+ score_model = MultiOutputRegressor(base_score_model)
141
+ score_model.fit(X_scaled, y_score)
142
 
143
+ return win_model, score_model, data, scaler
 
144
 
145
+ # Train player score model and return it
146
+ def train_player_score_model(match_df, ball_df):
147
+ player_runs = ball_df.groupby(['match_id', 'striker', 'batting_team'])['runs_off_bat'].sum().reset_index()
148
  player_runs.rename(columns={'runs_off_bat': 'player_total'}, inplace=True)
149
  player_data = player_runs.merge(match_df, left_on='match_id', right_on='id', how='left')
150
 
151
+ # Feature engineering
152
  player_data['player_avg'] = player_data.groupby('striker')['player_total'].transform('mean')
153
+ player_data['team_win_rate'] = player_data.apply(lambda x: player_data[player_data['team1'] == x['batting_team']]['team1_win_rate'].mean()
154
+ if x['batting_team'] == x['team1'] else player_data[player_data['team2'] == x['batting_team']]['team2_win_rate'].mean(), axis=1)
155
  player_data['venue_index'] = player_data['venue'].astype('category').cat.codes
156
  player_data['city_index'] = player_data['city'].astype('category').cat.codes
157
  player_data['toss_winner_index'] = player_data['toss_winner'].astype('category').cat.codes
158
  player_data['toss_decision_index'] = player_data['toss_decision'].map({'bat': 1, 'field': 0}).fillna(0).astype(int)
159
 
160
+ # Features and target
161
+ X = player_data[['player_avg', 'team_win_rate', 'venue_index', 'city_index', 'toss_winner_index', 'toss_decision_index']].dropna()
162
  y = player_data.loc[X.index, 'player_total']
163
 
164
  # Scale features
165
  scaler = StandardScaler()
166
  X_scaled = scaler.fit_transform(X)
167
 
168
+ # Train model
169
+ score_model = HistGradientBoostingRegressor(random_state=42, learning_rate=0.1, max_iter=100)
170
+ score_model.fit(X_scaled, y)
171
+
172
+ return score_model, scaler, player_data
173
+
174
+ # Prediction functions (unchanged except removing joblib.load)
175
+ def predict_player_score(player, team, opponent, venue=None, city=None, toss_winner=None, toss_decision=None,
176
+ score_model=None, scaler=None, player_data=None):
177
+ try:
178
+ if player not in player_data['striker'].values or team not in player_data['batting_team'].values:
179
+ raise ValueError("Player or team not found in training data")
180
+
181
+ player_avg = player_data[player_data['striker'] == player]['player_total'].mean()
182
+ team_win_rate = player_data[player_data['batting_team'] == team]['team_win_rate'].mean()
183
+ venue_index = player_data[player_data['venue'] == venue]['venue_index'].values[0] if venue else player_data['venue_index'].mean()
184
+ city_index = player_data[player_data['city'] == city]['city_index'].values[0] if city else player_data['city_index'].mean()
185
+ toss_winner_index = player_data[player_data['toss_winner'] == toss_winner]['toss_winner_index'].values[0] if toss_winner else player_data['toss_winner_index'].mean()
186
+ toss_decision_index = 1 if toss_decision == 'bat' else 0 if toss_decision == 'field' else player_data['toss_decision_index'].mean()
187
+
188
+ features = scaler.transform([[player_avg, team_win_rate, venue_index, city_index, toss_winner_index, toss_decision_index]])
189
+ predicted_score = score_model.predict(features)[0]
190
+
191
+ return {
192
+ "player": player,
193
+ "team": team,
194
+ "opponent": opponent,
195
+ "expected_score": round(predicted_score, 2)
196
+ }
197
+ except Exception as e:
198
+ print(f"Prediction error: {str(e)}")
199
+ return {
200
+ "player": player,
201
+ "team": team,
202
+ "opponent": opponent,
203
+ "expected_score": 0.0
204
+ }
205
+
206
+ def predict_team_performance(team1, team2, venue=None, city=None, toss_winner=None, toss_decision=None,
207
+ win_model=None, score_model=None, data=None, scaler=None):
208
+ try:
209
+ if team1 not in data['team1'].values or team2 not in data['team2'].values:
210
+ raise ValueError("Team not found in training data")
211
+
212
+ team1_index = data[data['team1'] == team1]['team1_index'].values[0]
213
+ team2_index = data[data['team2'] == team2]['team2_index'].values[0]
214
+ venue_index = data[data['venue'] == venue]['venue_index'].values[0] if venue else data['venue_index'].mean()
215
+ city_index = data[data['city'] == city]['city_index'].values[0] if city else data['city_index'].mean()
216
+ toss_winner_index = data[data['toss_winner'] == toss_winner]['toss_winner_index'].values[0] if toss_winner else data['toss_winner_index'].mean()
217
+ toss_decision_index = 1 if toss_decision == 'bat' else 0 if toss_decision == 'field' else data['toss_decision_index'].mean()
218
+ dl_applied = 0 if pd.isna(toss_decision) else data['dl_applied'].mean()
219
+ team1_win_rate = data[data['team1'] == team1]['team1_win_rate'].values[0]
220
+ team2_win_rate = data[data['team2'] == team2]['team2_win_rate'].values[0]
221
+ h2h_win_rate = data[(data['team1'] == team1) & (data['team2'] == team2)]['h2h_win_rate'].values[0] if not data[(data['team1'] == team1) & (data['team2'] == team2)].empty else 0
222
+
223
+ features = scaler.transform([[venue_index, city_index, toss_winner_index, toss_decision_index, dl_applied,
224
+ team1_win_rate, team2_win_rate, h2h_win_rate]])
225
+ win_probability = win_model.predict_proba([[team1_index, team2_index, features[0][0], features[0][1],
226
+ features[0][2], features[0][3], features[0][4], features[0][5],
227
+ features[0][6], features[0][7]]])[:, 1][0] * 100
228
+ predicted_scores = score_model.predict([[team1_index, team2_index, features[0][0], features[0][1],
229
+ features[0][2], features[0][3], features[0][4], features[0][5],
230
+ features[0][6], features[0][7]]])[0]
231
+
232
+ return {
233
+ "team1": team1,
234
+ "team2": team2,
235
+ "win_probability_team1": round(win_probability, 2),
236
+ "expected_team1_score": round(predicted_scores[0], 2),
237
+ "expected_team2_score": round(predicted_scores[1], 2)
238
+ }
239
+ except Exception as e:
240
+ print(f"Prediction error: {str(e)}")
241
+ return {
242
+ "team1": team1,
243
+ "team2": team2,
244
+ "win_probability_team1": 50.0,
245
+ "expected_team1_score": 0.0,
246
+ "expected_team2_score": 0.0
247
+ }
services.py CHANGED
@@ -1,37 +1,80 @@
1
- import os
2
  import pandas as pd
3
  import numpy as np
4
  from fastapi import HTTPException
5
- from models.train_model import train_player_score_model, train_team_performance_model # No .pkl files needed!
6
- from dotenv import load_dotenv
 
 
7
  from groq import Groq
8
 
9
- # Load environment variables for security
10
- load_dotenv()
 
 
 
 
 
 
 
 
11
 
12
- # 🔹 Secure API Key Storage (Avoid Hardcoding API Keys)
13
- GROQ_API_KEY = os.getenv("GROQ_API_KEY")
14
- if not GROQ_API_KEY:
15
- raise ValueError("Missing GROQ API key. Set it in environment variables.")
16
  client = Groq(api_key=GROQ_API_KEY)
17
 
18
- # 🔹 Load datasets
19
- match_df = pd.read_csv('data/cleaned_match_data.csv')
20
- ball_df = pd.read_csv('data/cleaned_ball_data.csv', low_memory=False)
 
 
 
 
 
 
 
 
 
21
 
22
- # 🔹 Train models dynamically (No `.pkl` files!)
23
- player_score_model, player_scaler = train_player_score_model()
24
- team_win_model, team_score_model = train_team_performance_model()
25
 
26
- # 🔹 LLM Summary Generation (Groq AI)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  def generate_summary(data, context_type):
28
- prompt = f"Summarize this {context_type} data in one sentence: {data}"
29
-
 
 
 
 
 
 
 
 
 
 
30
  try:
31
  chat_completion = client.chat.completions.create(
32
  model="mixtral-8x7b-32768",
33
- messages=[{"role": "system", "content": "You are a concise cricket analyst."},
34
- {"role": "user", "content": prompt}],
 
 
35
  max_tokens=50,
36
  temperature=0.7
37
  )
@@ -39,61 +82,190 @@ def generate_summary(data, context_type):
39
  except Exception as e:
40
  return f"Summary unavailable due to error: {str(e)}"
41
 
42
- # 🔹 Predict Player Score (No `.pkl` file needed)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  def predict_score(player_name: str, opposition_team: str):
44
  try:
45
- input_features = np.array([[50, 1, 2, 3, 1]]) # Example feature vector
46
- input_features = player_scaler.transform(input_features)
47
- predicted_runs = player_score_model.predict(input_features)[0]
 
 
 
 
 
 
 
48
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  stats = {
50
  "player": player_name,
 
51
  "opposition": opposition_team,
52
- "predicted_runs": round(predicted_runs, 2),
53
- "summary": generate_summary(predicted_runs, "prediction_score")
54
  }
55
- return stats
 
56
  except Exception as e:
57
- raise HTTPException(status_code=500, detail=f"Error predicting score: {str(e)}")
58
 
59
- # 🔹 Predict Team Outcome (No `.pkl` file needed)
60
  def predict_team_outcome(team1: str, team2: str):
61
- try:
62
- input_features = np.array([[1, 2]]) # Example feature vector
63
- win_probability = team_win_model.predict_proba(input_features)[:, 1][0] * 100
64
- predicted_scores = team_score_model.predict(input_features)[0]
65
-
66
- return {
67
- "team1": team1,
68
- "team2": team2,
69
- "win_probability_team1": round(win_probability, 2),
70
- "expected_team1_score": round(predicted_scores[0], 2),
71
- "expected_team2_score": round(predicted_scores[1], 2),
72
- "summary": generate_summary(win_probability, "prediction_team")
73
- }
74
- except Exception as e:
75
- raise HTTPException(status_code=500, detail=f"Error predicting team outcome: {str(e)}")
76
 
77
- # 🔹 Utility Functions
78
  def get_teams():
79
- return {"teams": sorted(set(match_df['team1'].unique().tolist() + match_df['team2'].unique().tolist()))}
80
 
81
  def get_players():
82
- return {"players": sorted(set(ball_df['striker'].dropna().unique().tolist()))}
 
83
 
84
  def get_seasons():
85
- return {"seasons": ["All Seasons"] + sorted(match_df['season'].dropna().unique().tolist())}
86
 
87
- # 🔹 Get Team Trends Over Time
88
  def get_team_trends(team_name: str):
89
  team_name = team_name.strip().title()
90
- team_matches = match_df[(match_df['team1'] == team_name) | (match_df['team2'] == team_name)]
91
  if team_matches.empty:
92
  raise HTTPException(status_code=404, detail="Team not found")
93
 
94
- # Calculate win percentage and other metrics for each season
95
  trends = []
96
- for season in match_df['season'].unique():
97
  season_matches = team_matches[team_matches['season'] == season]
98
  if not season_matches.empty:
99
  wins = season_matches[season_matches['winner'] == team_name].shape[0]
@@ -103,22 +275,21 @@ def get_team_trends(team_name: str):
103
  "season": season,
104
  "wins": wins,
105
  "total_matches": total_matches,
106
- "win_percentage": round(win_percentage, 2)
107
  })
108
 
109
- return {"team_name": team_name, "trends": trends, "summary": generate_summary(trends, "team_trends")}
110
 
111
- # 🔹 Get Player Trends Over Time
112
  def get_player_trends(player_name: str, role: str = "Batting"):
113
  player_name = player_name.strip().title()
114
  name_variations = [player_name, player_name.replace(" ", ""), " ".join(reversed(player_name.split()))]
115
- player_data = ball_df[ball_df['striker'].isin(name_variations) | ball_df['bowler'].isin(name_variations)]
116
  if player_data.empty:
117
  raise HTTPException(status_code=404, detail=f"Player '{player_name}' not found")
118
 
119
- # Calculate performance metrics for each season
120
  trends = []
121
- for season in ball_df['season'].unique():
122
  season_data = player_data[player_data['season'] == season]
123
  if not season_data.empty:
124
  if role == "Batting":
@@ -129,7 +300,7 @@ def get_player_trends(player_name: str, role: str = "Batting"):
129
  trends.append({
130
  "season": season,
131
  "total_runs": total_runs,
132
- "strike_rate": round(strike_rate, 2),
133
  "matches_played": matches_played
134
  })
135
  elif role == "Bowling":
@@ -142,9 +313,9 @@ def get_player_trends(player_name: str, role: str = "Batting"):
142
  trends.append({
143
  "season": season,
144
  "total_wickets": total_wickets,
145
- "bowling_average": round(bowling_average, 2),
146
- "economy_rate": round(economy_rate, 2),
147
  "matches_played": matches_played
148
  })
149
 
150
- return {"player_name": player_name, "role": role, "trends": trends, "summary": generate_summary(trends, "player_trends")}
 
 
1
import os

import numpy as np
import pandas as pd
from fastapi import HTTPException
from groq import Groq

from models.train_model import (
    load_and_preprocess_data,
    predict_player_score,
    predict_team_performance,
    train_player_score_model,
    train_team_performance_model,
)
9
 
10
# Trained models, scalers and dataframes shared by the service functions.
# They all start as None and are filled in by initialize_models().
TEAM_WIN_MODEL = None
TEAM_SCORE_MODEL = None
TEAM_DATA = None
TEAM_SCALER = None
PLAYER_SCORE_MODEL = None
PLAYER_SCALER = None
PLAYER_DATA = None
MATCH_DF = None
BALL_DF = None

# Groq client used for the one-sentence summaries.
# The API key is read from the GROQ_API_KEY environment variable so that no
# secret lives in source control.
# NOTE(review): the key that used to be hard-coded here was committed to the
# repository history and should be revoked and rotated.
# NOTE(review): presumably the Groq client raises at construction time when
# no key is available -- confirm against the SDK version in use.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
client = Groq(api_key=GROQ_API_KEY)
24
 
25
+ # Load data and train models at startup
26
def initialize_models(force: bool = False):
    """Load the datasets and train the prediction models into module globals.

    The function is idempotent: once the data has been loaded, further calls
    return immediately unless ``force`` is true. This matters because the
    module calls it at import time and the application's startup hook calls
    it again, which previously trained every model twice.

    Args:
        force: When true, reload the data and retrain even if models are
            already in memory.
    """
    global TEAM_WIN_MODEL, TEAM_SCORE_MODEL, TEAM_DATA, TEAM_SCALER
    global PLAYER_SCORE_MODEL, PLAYER_SCALER, PLAYER_DATA, MATCH_DF, BALL_DF

    if MATCH_DF is not None and not force:
        return

    # Work on locals first so a failure part-way through training never
    # leaves the globals half-populated (which would defeat the guard above).
    match_df, ball_df = load_and_preprocess_data()
    team_win_model, team_score_model, team_data, team_scaler = (
        train_team_performance_model(match_df)
    )
    player_score_model, player_scaler, player_data = (
        train_player_score_model(match_df, ball_df)
    )

    MATCH_DF, BALL_DF = match_df, ball_df
    TEAM_WIN_MODEL, TEAM_SCORE_MODEL = team_win_model, team_score_model
    TEAM_DATA, TEAM_SCALER = team_data, team_scaler
    PLAYER_SCORE_MODEL, PLAYER_SCALER = player_score_model, player_scaler
    PLAYER_DATA = player_data
    print("Models trained and loaded into memory.")
34
+
35
# Train eagerly at import time: the mapping built below reads BALL_DF, which
# is None until initialize_models() has run.
initialize_models()


def _most_frequent_team(teams):
    """Return the first modal value of *teams*, or None when there is none."""
    modes = teams.mode()
    return modes[0] if len(modes) > 0 else None


# Maps each striker to the batting team they appear with most often.
player_team_mapping = (
    BALL_DF.groupby('striker')['batting_team']
    .agg(_most_frequent_team)
    .to_dict()
)
 
40
 
41
+ # Clean JSON data (unchanged)
42
def clean_json(data):
    """Recursively convert *data* into plain JSON-serialisable Python values.

    Conversion rules, applied in order:
      * dicts and lists are rebuilt with every value cleaned;
      * booleans (built-in or NumPy) become ``bool``;
      * integers (built-in or NumPy) become ``int``;
      * floats (built-in or NumPy) become ``float``, with NaN and +/-inf
        replaced by ``0.0``;
      * other missing values (``None``, ``NaT``, ...) become ``None``;
      * ``pd.Timestamp`` becomes a ``YYYY-MM-DD`` string;
      * anything else is stringified.

    NumPy integer and boolean scalars used to fall through to the final
    ``str()`` call, so numeric values read from a DataFrame could be emitted
    as strings such as ``"5"``.
    """
    if isinstance(data, dict):
        return {k: clean_json(v) for k, v in data.items()}
    if isinstance(data, list):
        return [clean_json(v) for v in data]
    # bool must be tested before int: bool is a subclass of int.
    if isinstance(data, (bool, np.bool_)):
        return bool(data)
    if isinstance(data, (int, np.integer)):
        return int(data)
    if isinstance(data, (float, np.floating)):
        value = float(data)
        return 0.0 if pd.isna(value) or np.isinf(value) else value
    if pd.isna(data):
        return None
    if isinstance(data, pd.Timestamp):
        # NaT was already handled by the missing-value check above.
        return data.strftime('%Y-%m-%d')
    return str(data)
56
+
57
+ # Summary generation (unchanged)
58
  def generate_summary(data, context_type):
59
+ prompt = ""
60
+ if context_type == "player_stats":
61
+ prompt = f"Summarize this player data in one sentence: {data}"
62
+ elif context_type == "team_stats":
63
+ prompt = f"Summarize this team data in one sentence: {data}"
64
+ elif context_type == "match_history":
65
+ prompt = f"Summarize this match history between {data['team1']} and {data['team2']} in one sentence: {data['matches']}"
66
+ elif context_type == "prediction_score":
67
+ prompt = f"Summarize this prediction in one sentence: {data}"
68
+ elif context_type == "prediction_team":
69
+ prompt = f"Summarize this team prediction in one sentence: {data}"
70
+
71
  try:
72
  chat_completion = client.chat.completions.create(
73
  model="mixtral-8x7b-32768",
74
+ messages=[
75
+ {"role": "system", "content": "You are a concise cricket analyst."},
76
+ {"role": "user", "content": prompt}
77
+ ],
78
  max_tokens=50,
79
  temperature=0.7
80
  )
 
82
  except Exception as e:
83
  return f"Summary unavailable due to error: {str(e)}"
84
 
85
+ # Player stats (unchanged except using global BALL_DF)
86
def get_player_stats(player_name: str, season: str = None, role: str = "Batting"):
    """Return aggregate batting or bowling statistics for one player.

    Args:
        player_name: Player name. It is stripped and title-cased; the
            spaceless and word-reversed variants are matched as well.
        season: Optional season filter; when falsy, all seasons are used.
        role: "Batting" or "Bowling" (matched case-insensitively).

    Returns:
        A JSON-safe dict of statistics that includes a generated "summary".

    Raises:
        HTTPException: 404 when no deliveries match the player (and season);
            400 when ``role`` is neither "Batting" nor "Bowling".
    """
    player_name = player_name.strip().title()
    name_variations = [
        player_name,
        player_name.replace(" ", ""),
        " ".join(reversed(player_name.split())),
    ]
    involved = (
        BALL_DF['striker'].isin(name_variations)
        | BALL_DF['bowler'].isin(name_variations)
    )
    player_data = BALL_DF[involved]
    if season and 'season' in BALL_DF.columns:
        player_data = player_data[player_data['season'] == season]
    if player_data.empty:
        raise HTTPException(status_code=404, detail=f"Player '{player_name}' not found. Variations tried: {name_variations}")

    # An unrecognised role used to fall through both branches and return
    # None; reject it explicitly instead.
    role = (role or "").strip().title()
    if role not in ("Batting", "Bowling"):
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported role '{role}'. Use 'Batting' or 'Bowling'.",
        )

    season_label = season if season else "All Seasons"

    if role == "Batting":
        batting_data = player_data[player_data['striker'].isin(name_variations)]
        total_runs = int(batting_data['runs_off_bat'].sum())
        # One row per delivery, so the row count is the number of balls faced.
        balls_faced = int(batting_data.shape[0])
        strike_rate = float((total_runs / balls_faced * 100) if balls_faced > 0 else 0)
        stats = {
            "player_name": player_name,
            "role": role,
            "total_runs": total_runs,
            "balls_faced": balls_faced,
            "strike_rate": strike_rate,
            "matches_played": int(len(batting_data['match_id'].unique())),
            "season": season_label,
        }
    else:
        bowling_data = player_data[player_data['bowler'].isin(name_variations)]
        # Only these dismissal types are counted as the bowler's wickets.
        bowler_wicket_types = ["caught", "bowled", "lbw", "caught and bowled", "hit wicket"]
        wickets_data = bowling_data[
            bowling_data['player_dismissed'].notna()
            & bowling_data['wicket_type'].isin(bowler_wicket_types)
        ]
        total_wickets = int(wickets_data.shape[0])
        total_runs_conceded = int(bowling_data['total_runs'].sum())
        total_balls_bowled = int(bowling_data.shape[0])
        total_overs_bowled = float(total_balls_bowled / 6)

        # Average and strike rate are undefined without a wicket; report 0.0.
        if total_wickets > 0:
            bowling_average = round(total_runs_conceded / total_wickets, 2)
            bowling_strike_rate = round(total_balls_bowled / total_wickets, 2)
        else:
            bowling_average = 0.0
            bowling_strike_rate = 0.0
        if total_overs_bowled > 0:
            economy_rate = round(total_runs_conceded / total_overs_bowled, 2)
        else:
            economy_rate = 0

        stats = {
            "player_name": player_name,
            "role": role,
            "total_wickets": total_wickets,
            "bowling_average": bowling_average,
            "economy_rate": economy_rate,
            "bowling_strike_rate": bowling_strike_rate,
            "overs_bowled": round(total_overs_bowled, 1),
            "bowling_matches": int(len(bowling_data['match_id'].unique())),
            "season": season_label,
        }

    stats["summary"] = generate_summary(stats, "player_stats")
    return clean_json(stats)
141
+
142
+ # Team stats (unchanged except using global MATCH_DF)
143
def get_team_stats(team_name: str, season: str = None):
    """Return win/loss totals for a team, optionally limited to one season.

    Args:
        team_name: Team name; stripped and title-cased before matching
            against the ``team1`` / ``team2`` columns.
        season: Optional season filter; when falsy, all seasons are used.

    Returns:
        A JSON-safe dict with match totals, win percentage and a generated
        "summary".

    Raises:
        HTTPException: 404 when no matches remain after filtering.
    """
    team_name = team_name.strip().title()

    played_as_team1 = MATCH_DF['team1'] == team_name
    played_as_team2 = MATCH_DF['team2'] == team_name
    played = MATCH_DF[played_as_team1 | played_as_team2]
    if season and 'season' in MATCH_DF.columns:
        played = played[played['season'] == season]
    if played.empty:
        raise HTTPException(status_code=404, detail="Team not found")

    total_matches = int(played.shape[0])
    wins = int(played[played['winner'] == team_name].shape[0])
    if total_matches > 0:
        win_percentage = float(wins / total_matches * 100)
    else:
        win_percentage = float(0)

    stats = {
        "total_matches": total_matches,
        "wins": wins,
        # Every match the team did not win is counted as a loss here.
        "losses": total_matches - wins,
        "win_percentage": win_percentage,
        "season": season if season else "All Seasons",
    }
    stats["summary"] = generate_summary(stats, "team_stats")
    return clean_json(stats)
163
+
164
+ # Match history (unchanged except using global MATCH_DF)
165
def get_match_history(team1: str, team2: str, season: str = None):
    """Return the head-to-head match list between two teams.

    Args:
        team1: First team name; stripped and title-cased before matching.
        team2: Second team name; stripped and title-cased before matching.
        season: Optional season filter; when falsy, all seasons are used.

    Returns:
        A JSON-safe dict with both team names, the season label, one record
        per match (date, team1, team2, winner) and a generated "summary".

    Raises:
        HTTPException: 404 when either team is unknown or the two teams have
            no matches after filtering.
    """
    team1 = team1.strip().title()
    team2 = team2.strip().title()

    known_teams = {
        *MATCH_DF['team1'].unique().tolist(),
        *MATCH_DF['team2'].unique().tolist(),
    }
    # team1 is checked first so it is the one reported when both are unknown.
    for candidate in (team1, team2):
        if candidate not in known_teams:
            raise HTTPException(status_code=404, detail=f"Team {candidate} not found.")

    as_listed = (MATCH_DF['team1'] == team1) & (MATCH_DF['team2'] == team2)
    swapped = (MATCH_DF['team1'] == team2) & (MATCH_DF['team2'] == team1)
    fixtures = MATCH_DF[as_listed | swapped].copy()
    if season and 'season' in MATCH_DF.columns:
        fixtures = fixtures[fixtures['season'] == season]
    if fixtures.empty:
        raise HTTPException(status_code=404, detail=f"No match history found between {team1} and {team2}.")

    def _iso_date(value):
        return value.strftime('%Y-%m-%d') if pd.notna(value) else None

    def _text_or_none(value):
        return str(value) if pd.notna(value) else None

    fixtures['date'] = fixtures['date'].apply(_iso_date)
    # A match with no recorded winner is reported as a draw.
    fixtures['winner'] = fixtures['winner'].fillna("Draw")
    for column in ('team1', 'team2', 'winner'):
        fixtures[column] = fixtures[column].apply(_text_or_none)
    records = fixtures[['date', 'team1', 'team2', 'winner']].to_dict(orient='records')

    response = {
        "team1": team1,
        "team2": team2,
        "season": season or "All Seasons",
        "matches": records,
    }
    response["summary"] = generate_summary(response, "match_history")
    return clean_json(response)
195
+
196
+ # Prediction functions using in-memory models
197
def predict_score(player_name: str, opposition_team: str):
    """Predict a player's score against an opposition team.

    The player's team is looked up in ``player_team_mapping`` using the
    title-cased name and its spaceless and word-reversed variants. Venue,
    city and toss inputs are passed to the model as ``None``.

    Args:
        player_name: Player name; "+" characters are treated as spaces.
        opposition_team: Name of the opposing team, passed through as given.

    Returns:
        A JSON-safe dict with the player, their team, the opposition, the
        predicted runs and a generated "summary".

    Raises:
        HTTPException: 404 when the player has no known team (this used to
            surface as a 500 because the lookup error was swallowed by the
            blanket handler); 500 when the prediction itself fails.
    """
    player_name = player_name.strip().replace("+", " ").title()
    name_variations = [
        player_name,
        player_name.replace(" ", ""),
        " ".join(reversed(player_name.split())),
    ]

    player_team = None
    for name in name_variations:
        if name in player_team_mapping:
            player_team = player_team_mapping[name]
            player_name = name
            break
    if not player_team:
        raise HTTPException(
            status_code=404,
            detail=f"Player '{player_name}' not found in historical data",
        )

    try:
        predicted_runs = predict_player_score(
            player=player_name,
            team=player_team,
            opponent=opposition_team,
            venue=None,
            city=None,
            toss_winner=None,
            toss_decision=None,
            score_model=PLAYER_SCORE_MODEL,
            scaler=PLAYER_SCALER,
            player_data=PLAYER_DATA,
        )
        stats = {
            "player": player_name,
            "team": player_team,
            "opposition": opposition_team,
            "predicted_runs": predicted_runs["expected_score"],
        }
        stats["summary"] = generate_summary(stats, "prediction_score")
        return clean_json(stats)
    except HTTPException:
        # Keep deliberate HTTP errors intact instead of rewrapping them.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Error predicting score for {player_name} against {opposition_team}: {str(e)}",
        ) from e
232
 
 
233
def predict_team_outcome(team1: str, team2: str):
    """Predict the outcome of a match between two teams.

    Delegates to ``predict_team_performance`` with the in-memory team models;
    venue, city and toss inputs are passed as ``None``. A generated
    "summary" entry is added to the prediction before it is cleaned for
    JSON output.
    """
    model_inputs = {
        "team1": team1,
        "team2": team2,
        "venue": None,
        "city": None,
        "toss_winner": None,
        "toss_decision": None,
        "win_model": TEAM_WIN_MODEL,
        "score_model": TEAM_SCORE_MODEL,
        "data": TEAM_DATA,
        "scaler": TEAM_SCALER,
    }
    outcome = predict_team_performance(**model_inputs)
    outcome["summary"] = generate_summary(outcome, "prediction_team")
    return clean_json(outcome)
 
248
 
249
+ # Utility functions (unchanged except using global dataframes)
250
def get_teams():
    """Return the sorted, de-duplicated team names from both team columns."""
    team_names = {
        *MATCH_DF['team1'].unique().tolist(),
        *MATCH_DF['team2'].unique().tolist(),
    }
    return clean_json({"teams": sorted(team_names)})
252
 
253
def get_players():
    """Return the sorted names of every striker present in the ball data."""
    strikers = BALL_DF['striker'].dropna().unique().tolist()
    return clean_json({"players": sorted(set(strikers))})
256
 
257
def get_seasons():
    """Return the available seasons, preceded by the "All Seasons" option."""
    known_seasons = sorted(MATCH_DF['season'].dropna().unique().tolist())
    return clean_json({"seasons": ["All Seasons", *known_seasons]})
259
 
260
+ # Team trends (unchanged except using global MATCH_DF)
261
  def get_team_trends(team_name: str):
262
  team_name = team_name.strip().title()
263
+ team_matches = MATCH_DF[(MATCH_DF['team1'] == team_name) | (MATCH_DF['team2'] == team_name)]
264
  if team_matches.empty:
265
  raise HTTPException(status_code=404, detail="Team not found")
266
 
 
267
  trends = []
268
+ for season in MATCH_DF['season'].unique():
269
  season_matches = team_matches[team_matches['season'] == season]
270
  if not season_matches.empty:
271
  wins = season_matches[season_matches['winner'] == team_name].shape[0]
 
275
  "season": season,
276
  "wins": wins,
277
  "total_matches": total_matches,
278
+ "win_percentage": win_percentage
279
  })
280
 
281
+ return {"team_name": team_name, "trends": trends}
282
 
283
+ # Player trends (unchanged except using global BALL_DF)
284
  def get_player_trends(player_name: str, role: str = "Batting"):
285
  player_name = player_name.strip().title()
286
  name_variations = [player_name, player_name.replace(" ", ""), " ".join(reversed(player_name.split()))]
287
+ player_data = BALL_DF[BALL_DF['striker'].isin(name_variations) | BALL_DF['bowler'].isin(name_variations)]
288
  if player_data.empty:
289
  raise HTTPException(status_code=404, detail=f"Player '{player_name}' not found")
290
 
 
291
  trends = []
292
+ for season in BALL_DF['season'].unique():
293
  season_data = player_data[player_data['season'] == season]
294
  if not season_data.empty:
295
  if role == "Batting":
 
300
  trends.append({
301
  "season": season,
302
  "total_runs": total_runs,
303
+ "strike_rate": strike_rate,
304
  "matches_played": matches_played
305
  })
306
  elif role == "Bowling":
 
313
  trends.append({
314
  "season": season,
315
  "total_wickets": total_wickets,
316
+ "bowling_average": bowling_average,
317
+ "economy economy_rate": economy_rate,
318
  "matches_played": matches_played
319
  })
320
 
321
+ return {"player_name": player_name, "role": role, "trends": trends}