sivapriya175 committed on
Commit
ee329f7
·
1 Parent(s): 8bf5b2b

deploy backend files

Browse files
.env ADDED
File without changes
__pycache__/services.cpython-313.pyc CHANGED
Binary files a/__pycache__/services.cpython-313.pyc and b/__pycache__/services.cpython-313.pyc differ
 
models/__pycache__/train_model.cpython-313.pyc CHANGED
Binary files a/models/__pycache__/train_model.cpython-313.pyc and b/models/__pycache__/train_model.cpython-313.pyc differ
 
models/train_model.py CHANGED
@@ -5,291 +5,65 @@ from sklearn.multioutput import MultiOutputRegressor
5
  from sklearn.model_selection import train_test_split
6
  from sklearn.metrics import mean_squared_error, accuracy_score, r2_score
7
  from sklearn.preprocessing import StandardScaler
8
- import joblib
9
 
10
- # Load datasets with exact column names
11
- ball_df = pd.read_csv('data/cleaned_ball_data.csv',
12
- dtype={
13
- 'match_id': str, 'season': str, 'start_date': str, 'venue': str,
14
- 'innings': int, 'ball': float, 'batting_team': str, 'bowling_team': str,
15
- 'striker': str, 'non_striker': str, 'bowler': str, 'runs_off_bat': int,
16
- 'extras': int, 'wides': float, 'noballs': float, 'byes': float,
17
- 'legbyes': float, 'penalty': float, 'wicket_type': str,
18
- 'player_dismissed': str, 'other_wicket_type': str,
19
- 'other_player_dismissed': str, 'cricsheet_id': str, 'total_runs': int
20
- })
21
- match_df = pd.read_csv('data/cleaned_match_data.csv',
22
- dtype={
23
- 'id': str, 'season': str, 'city': str, 'date': str,
24
- 'team1': str, 'team2': str, 'toss_winner': str, 'toss_decision': str,
25
- 'result': str, 'dl_applied': int, 'winner': str,
26
- 'win_by_runs': float, 'win_by_wickets': float, 'player_of_match': str,
27
- 'venue': str, 'umpire1': str, 'umpire2': str, 'umpire3': str
28
- })
29
 
30
- # Convert date columns to datetime
31
  match_df['date'] = pd.to_datetime(match_df['date'], errors='coerce')
32
  ball_df['start_date'] = pd.to_datetime(ball_df['start_date'], errors='coerce')
33
 
34
- # Filter for ODI matches (proxy based on date range; adjust as needed)
35
- odi_date_mask = (match_df['date'].dt.year >= 2015) & (match_df['date'].dt.year <= 2022)
36
- match_df = match_df[odi_date_mask].copy()
37
-
38
- # Compute team total scores from ball_df
39
- team_scores = ball_df.groupby(['match_id', 'batting_team'])['total_runs'].sum().reset_index()
40
- team_scores.rename(columns={'total_runs': 'team_total'}, inplace=True)
41
-
42
- # Merge computed scores into match_df with better handling for missing data
43
- match_df = match_df.merge(team_scores, left_on=['id', 'team1'], right_on=['match_id', 'batting_team'], how='left')
44
- match_df.rename(columns={'team_total': 'team1_total'}, inplace=True)
45
- match_df['team1_total'] = match_df['team1_total'].fillna(match_df['team1_total'].mean())
46
- match_df = match_df.merge(team_scores, left_on=['id', 'team2'], right_on=['match_id', 'batting_team'], how='left')
47
- match_df.rename(columns={'team_total': 'team2_total'}, inplace=True)
48
- match_df['team2_total'] = match_df['team2_total'].fillna(match_df['team2_total'].mean())
49
-
50
- # Drop extra columns created during merging
51
- match_df.drop(columns=['batting_team', 'match_id'], errors='ignore', inplace=True)
52
-
53
- # Add venue and city indices
54
- match_df['venue_index'] = match_df['venue'].astype('category').cat.codes
55
- match_df['city_index'] = match_df['city'].astype('category').cat.codes
56
-
57
- # Add toss features
58
- match_df['toss_winner_index'] = match_df['toss_winner'].astype('category').cat.codes
59
- match_df['toss_decision_index'] = match_df['toss_decision'].map({'bat': 1, 'field': 0}).fillna(0).astype(int)
60
-
61
- # Compute historical win rates for each team (weighted by recency)
62
- match_df['date_numeric'] = (match_df['date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1d')
63
- max_date = match_df['date_numeric'].max()
64
- team1_wins = match_df[match_df['winner'] == match_df['team1']].groupby('team1').agg({'date_numeric': 'mean', 'id': 'count'}).reset_index()
65
- team1_wins.rename(columns={'id': 'wins', 'date_numeric': 'win_date', 'team1': 'team'}, inplace=True)
66
- team2_wins = match_df[match_df['winner'] == match_df['team2']].groupby('team2').agg({'date_numeric': 'mean', 'id': 'count'}).reset_index()
67
- team2_wins.rename(columns={'id': 'wins', 'date_numeric': 'win_date', 'team2': 'team'}, inplace=True)
68
- team_wins = pd.concat([team1_wins, team2_wins]).groupby('team').agg({'wins': 'sum', 'win_date': 'mean'}).reset_index()
69
- team1_matches = match_df.groupby('team1').size().reset_index(name='matches')
70
- team1_matches.rename(columns={'team1': 'team'}, inplace=True)
71
- team2_matches = match_df.groupby('team2').size().reset_index(name='matches')
72
- team2_matches.rename(columns={'team2': 'team'}, inplace=True)
73
- team_matches = pd.concat([team1_matches, team2_matches]).groupby('team')['matches'].sum().reset_index()
74
- team_win_rates = team_matches.merge(team_wins, on='team', how='left').fillna(0)
75
- team_win_rates['weighted_wins'] = team_win_rates.apply(lambda x: x['wins'] * np.exp(-0.1 * (max_date - x['win_date']) / 365) if pd.notna(x['win_date']) else 0, axis=1)
76
- team_win_rates['win_rate'] = team_win_rates['weighted_wins'] / team_win_rates['matches']
77
- team_win_rates['win_rate'] = team_win_rates['win_rate'].fillna(0)
78
- match_df = match_df.merge(team_win_rates[['team', 'win_rate']].rename(columns={'team': 'team1', 'win_rate': 'team1_win_rate'}), on='team1', how='left')
79
- match_df = match_df.merge(team_win_rates[['team', 'win_rate']].rename(columns={'team': 'team2', 'win_rate': 'team2_win_rate'}), on='team2', how='left')
80
-
81
- # Compute head-to-head win rates with minimum match threshold
82
- head_to_head = match_df[match_df['team1'].isin(match_df['team1'].unique()) & match_df['team2'].isin(match_df['team2'].unique())]
83
- head_to_head_wins = head_to_head[head_to_head['winner'] == head_to_head['team1']].groupby(['team1', 'team2']).size().reset_index(name='h2h_wins')
84
- head_to_head_matches = head_to_head.groupby(['team1', 'team2']).size().reset_index(name='h2h_matches')
85
- h2h_win_rates = head_to_head_matches.merge(head_to_head_wins, on=['team1', 'team2'], how='left').fillna(0)
86
- h2h_win_rates = h2h_win_rates[head_to_head_matches['h2h_matches'] >= 1]
87
- h2h_win_rates['h2h_win_rate'] = h2h_win_rates['h2h_wins'] / h2h_win_rates['h2h_matches']
88
- match_df = match_df.merge(h2h_win_rates[['team1', 'team2', 'h2h_win_rate']], on=['team1', 'team2'], how='left').fillna(0)
89
-
90
- # Cap outliers in target variables
91
- match_df['team1_total'] = match_df['team1_total'].clip(upper=500)
92
- match_df['team2_total'] = match_df['team2_total'].clip(upper=500)
93
-
94
- # Train Team Performance Prediction Model
95
- def train_team_performance_model():
96
- data = match_df[['team1', 'team2', 'winner', 'team1_total', 'team2_total', 'venue_index', 'city_index',
97
- 'toss_winner_index', 'toss_decision_index', 'dl_applied', 'team1_win_rate',
98
- 'team2_win_rate', 'h2h_win_rate']].dropna()
99
-
100
- # Convert categorical teams to numerical indices
101
- data['team1_index'] = data['team1'].astype('category').cat.codes
102
- data['team2_index'] = data['team2'].astype('category').cat.codes
103
- data['winner_index'] = (data['winner'] == data['team1']).astype(int)
104
-
105
- # Features and targets
106
- X = pd.DataFrame()
107
- X['team1_index'] = data['team1_index']
108
- X['team2_index'] = data['team2_index']
109
- X['venue_index'] = data['venue_index']
110
- X['city_index'] = data['city_index']
111
- X['toss_winner_index'] = data['toss_winner_index']
112
- X['toss_decision_index'] = data['toss_decision_index']
113
- X['dl_applied'] = data['dl_applied']
114
- X['team1_win_rate'] = data['team1_win_rate']
115
- X['team2_win_rate'] = data['team2_win_rate']
116
- X['h2h_win_rate'] = data['h2h_win_rate'] * 2 # Double weight to head-to-head
117
-
118
- y_win = data['winner_index']
119
- y_score = data[['team1_total', 'team2_total']]
120
-
121
- # Scale numerical features
122
- scaler = StandardScaler()
123
- scaled_features = scaler.fit_transform(X[['venue_index', 'city_index', 'toss_winner_index', 'toss_decision_index',
124
- 'dl_applied', 'team1_win_rate', 'team2_win_rate', 'h2h_win_rate']])
125
- X_scaled = pd.DataFrame(scaled_features, columns=['venue_index', 'city_index', 'toss_winner_index', 'toss_decision_index',
126
- 'dl_applied', 'team1_win_rate', 'team2_win_rate', 'h2h_win_rate'])
127
- X_scaled['team1_index'] = X['team1_index']
128
- X_scaled['team2_index'] = X['team2_index']
129
-
130
- # Train/test split for win prediction
131
- X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_win, test_size=0.2, random_state=42)
132
-
133
- # Train RandomForestClassifier with tuned hyperparameters
134
- win_model = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, class_weight='balanced')
135
- win_model.fit(X_train, y_train)
136
-
137
- # Evaluate
138
- y_pred = win_model.predict(X_test)
139
- accuracy = accuracy_score(y_test, y_pred)
140
- print(f"Team Win Model Accuracy: {accuracy}")
141
-
142
- # Train HistGradientBoostingRegressor with MultiOutputRegressor for score prediction
143
- base_score_model = HistGradientBoostingRegressor(random_state=42, learning_rate=0.1, max_iter=100)
144
- score_model = MultiOutputRegressor(base_score_model)
145
- score_model.fit(X_scaled, y_score)
146
-
147
- # Evaluate score model
148
- y_score_pred = score_model.predict(X_scaled)
149
- mse = mean_squared_error(y_score, y_score_pred)
150
- r2 = r2_score(y_score, y_score_pred)
151
- print(f"Team Score Model MSE: {mse}, R²: {r2}")
152
-
153
- # Save models and scaler
154
- joblib.dump((win_model, score_model, data, scaler), 'models/team_performance_predictor.pkl')
155
-
156
- # Train Player Score Prediction Model
157
  def train_player_score_model():
158
- # Aggregate player runs per match from ball_df, including batting_team
159
- player_runs = ball_df.groupby(['match_id', 'striker', 'batting_team'])['runs_off_bat'].sum().reset_index()
160
  player_runs.rename(columns={'runs_off_bat': 'player_total'}, inplace=True)
161
-
162
- # Merge with match_df to get match context
163
  player_data = player_runs.merge(match_df, left_on='match_id', right_on='id', how='left')
164
 
165
- # Feature engineering for player performance
166
- player_data['player_avg'] = player_data.groupby('striker')['player_total'].transform('mean') # Average runs per player
167
- player_data['team_win_rate'] = player_data.apply(lambda x: player_data[player_data['team1'] == x['batting_team']]['team1_win_rate'].mean()
168
- if x['batting_team'] == x['team1'] else player_data[player_data['team2'] == x['batting_team']]['team2_win_rate'].mean(), axis=1)
169
  player_data['venue_index'] = player_data['venue'].astype('category').cat.codes
170
  player_data['city_index'] = player_data['city'].astype('category').cat.codes
171
  player_data['toss_winner_index'] = player_data['toss_winner'].astype('category').cat.codes
172
  player_data['toss_decision_index'] = player_data['toss_decision'].map({'bat': 1, 'field': 0}).fillna(0).astype(int)
173
 
174
- # Features and target
175
- X = player_data[['player_avg', 'team_win_rate', 'venue_index', 'city_index', 'toss_winner_index', 'toss_decision_index']].dropna()
176
  y = player_data.loc[X.index, 'player_total']
177
 
178
- # Scale numerical features
179
  scaler = StandardScaler()
180
  X_scaled = scaler.fit_transform(X)
181
 
182
- # Train/test split
183
- X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
184
-
185
- # Train HistGradientBoostingRegressor
186
- score_model = HistGradientBoostingRegressor(random_state=42, learning_rate=0.1, max_iter=100)
187
- score_model.fit(X_train, y_train)
188
-
189
- # Evaluate
190
- y_pred = score_model.predict(X_test)
191
- mse = mean_squared_error(y_test, y_pred)
192
- r2 = r2_score(y_test, y_pred)
193
- print(f"Player Score Model MSE: {mse}, R²: {r2}")
194
-
195
- # Save model and scaler
196
- joblib.dump((score_model, scaler, player_data), 'models/player_score_predictor.pkl')
197
-
198
- # Predict Player Score
199
- def predict_player_score(player: str, team: str, opponent: str, venue: str = None, city: str = None,
200
- toss_winner: str = None, toss_decision: str = None):
201
- try:
202
- score_model, scaler, player_data = joblib.load('models/player_score_predictor.pkl')
203
-
204
- if player not in player_data['striker'].values or team not in player_data['batting_team'].values:
205
- raise ValueError("Player or team not found in training data")
206
-
207
- # Compute player average from historical data
208
- player_avg = player_data[player_data['striker'] == player]['player_total'].mean()
209
- team_win_rate = player_data[player_data['batting_team'] == team]['team_win_rate'].mean()
210
-
211
- # Use specific values if provided, otherwise default to mean
212
- venue_index = player_data[player_data['venue'] == venue]['venue_index'].values[0] if venue else player_data['venue_index'].mean()
213
- city_index = player_data[player_data['city'] == city]['city_index'].values[0] if city else player_data['city_index'].mean()
214
- toss_winner_index = player_data[player_data['toss_winner'] == toss_winner]['toss_winner_index'].values[0] if toss_winner else player_data['toss_winner_index'].mean()
215
- toss_decision_index = 1 if toss_decision == 'bat' else 0 if toss_decision == 'field' else player_data['toss_decision_index'].mean()
216
-
217
- # Scale features
218
- features = scaler.transform([[player_avg, team_win_rate, venue_index, city_index, toss_winner_index, toss_decision_index]])
219
- predicted_score = score_model.predict(features)[0]
220
 
221
- return {
222
- "player": player,
223
- "team": team,
224
- "opponent": opponent,
225
- "expected_score": round(predicted_score, 2)
226
- }
227
- except Exception as e:
228
- print(f"Prediction error: {str(e)}")
229
- return {
230
- "player": player,
231
- "team": team,
232
- "opponent": opponent,
233
- "expected_score": 0.0
234
- }
235
 
236
- # Predict Team Win Percentage & Expected Score with debugging
237
- def predict_team_performance(team1: str, team2: str, venue: str = None, city: str = None,
238
- toss_winner: str = None, toss_decision: str = None):
239
- try:
240
- win_model, score_model, data, scaler = joblib.load('models/team_performance_predictor.pkl')
241
-
242
- if team1 not in data['team1'].values or team2 not in data['team2'].values:
243
- raise ValueError("Team not found in training data")
244
-
245
- # Get team indices
246
- team1_index = data[data['team1'] == team1]['team1_index'].values[0]
247
- team2_index = data[data['team2'] == team2]['team2_index'].values[0]
248
-
249
- # Use specific values if provided, otherwise default to mean
250
- venue_index = data[data['venue'] == venue]['venue_index'].values[0] if venue else data['venue_index'].mean()
251
- city_index = data[data['city'] == city]['city_index'].values[0] if city else data['city_index'].mean()
252
- toss_winner_index = data[data['toss_winner'] == toss_winner]['toss_winner_index'].values[0] if toss_winner else data['toss_winner_index'].mean()
253
- toss_decision_index = 1 if toss_decision == 'bat' else 0 if toss_decision == 'field' else data['toss_decision_index'].mean()
254
- dl_applied = 0 if pd.isna(toss_decision) else data['dl_applied'].mean()
255
- team1_win_rate = data[data['team1'] == team1]['team1_win_rate'].values[0]
256
- team2_win_rate = data[data['team2'] == team2]['team2_win_rate'].values[0]
257
- h2h_win_rate = data[(data['team1'] == team1) & (data['team2'] == team2)]['h2h_win_rate'].values[0] if not data[(data['team1'] == team1) & (data['team2'] == team2)].empty else 0
258
 
259
- # Debug head-to-head and win rates
260
- print(f"Team1: {team1}, Team2: {team2}, h2h_win_rate: {h2h_win_rate}, team1_win_rate: {team1_win_rate}, team2_win_rate: {team2_win_rate}")
 
 
261
 
262
- # Scale features
263
- features = scaler.transform([[venue_index, city_index, toss_winner_index, toss_decision_index, dl_applied,
264
- team1_win_rate, team2_win_rate, h2h_win_rate]])
265
- win_probability = win_model.predict_proba([[team1_index, team2_index, features[0][0], features[0][1],
266
- features[0][2], features[0][3], features[0][4], features[0][5],
267
- features[0][6], features[0][7]]])[:, 1][0] * 100
268
- predicted_scores = score_model.predict([[team1_index, team2_index, features[0][0], features[0][1],
269
- features[0][2], features[0][3], features[0][4], features[0][5],
270
- features[0][6], features[0][7]]])[0]
271
 
272
- if np.isnan(predicted_scores[0]) or np.isnan(predicted_scores[1]):
273
- print(f"Warning: Predicted scores are NaN for {team1} vs {team2}")
 
 
274
 
275
- return {
276
- "team1": team1,
277
- "team2": team2,
278
- "win_probability_team1": round(win_probability, 2),
279
- "expected_team1_score": round(predicted_scores[0], 2),
280
- "expected_team2_score": round(predicted_scores[1], 2)
281
- }
282
- except Exception as e:
283
- print(f"Prediction error: {str(e)}")
284
- return {
285
- "team1": team1,
286
- "team2": team2,
287
- "win_probability_team1": 50.0,
288
- "expected_team1_score": 0.0,
289
- "expected_team2_score": 0.0
290
- }
291
 
292
- # Train the models
293
- if __name__ == "__main__":
294
- train_team_performance_model()
295
- train_player_score_model()
 
5
  from sklearn.model_selection import train_test_split
6
  from sklearn.metrics import mean_squared_error, accuracy_score, r2_score
7
  from sklearn.preprocessing import StandardScaler
 
8
 
9
+ # Load datasets
10
+ ball_df = pd.read_csv('data/cleaned_ball_data.csv')
11
+ match_df = pd.read_csv('data/cleaned_match_data.csv')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
+ # Convert date columns
14
  match_df['date'] = pd.to_datetime(match_df['date'], errors='coerce')
15
  ball_df['start_date'] = pd.to_datetime(ball_df['start_date'], errors='coerce')
16
 
17
+ # Train Player Score Model (Without Saving .pkl)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  def train_player_score_model():
19
+ player_runs = ball_df.groupby(['match_id', 'striker'])['runs_off_bat'].sum().reset_index()
 
20
  player_runs.rename(columns={'runs_off_bat': 'player_total'}, inplace=True)
 
 
21
  player_data = player_runs.merge(match_df, left_on='match_id', right_on='id', how='left')
22
 
23
+ # Feature Engineering
24
+ player_data['player_avg'] = player_data.groupby('striker')['player_total'].transform('mean')
 
 
25
  player_data['venue_index'] = player_data['venue'].astype('category').cat.codes
26
  player_data['city_index'] = player_data['city'].astype('category').cat.codes
27
  player_data['toss_winner_index'] = player_data['toss_winner'].astype('category').cat.codes
28
  player_data['toss_decision_index'] = player_data['toss_decision'].map({'bat': 1, 'field': 0}).fillna(0).astype(int)
29
 
30
+ # Features and Target
31
+ X = player_data[['player_avg', 'venue_index', 'city_index', 'toss_winner_index', 'toss_decision_index']].dropna()
32
  y = player_data.loc[X.index, 'player_total']
33
 
34
+ # Scale features
35
  scaler = StandardScaler()
36
  X_scaled = scaler.fit_transform(X)
37
 
38
+ # Train Model
39
+ model = HistGradientBoostingRegressor(random_state=42, learning_rate=0.1, max_iter=100)
40
+ model.fit(X_scaled, y)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
+ return model, scaler
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
+ # Train Team Performance Model (Without Saving .pkl)
45
+ def train_team_performance_model():
46
+ data = match_df[['team1', 'team2', 'winner', 'team1_total', 'team2_total', 'venue', 'city', 'toss_winner', 'toss_decision']].dropna()
47
+ data['team1_index'] = data['team1'].astype('category').cat.codes
48
+ data['team2_index'] = data['team2'].astype('category').cat.codes
49
+ data['winner_index'] = (data['winner'] == data['team1']).astype(int)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
+ # Features and targets
52
+ X = data[['team1_index', 'team2_index']]
53
+ y_win = data['winner_index']
54
+ y_score = data[['team1_total', 'team2_total']]
55
 
56
+ # Train Team Win Prediction Model
57
+ win_model = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42)
58
+ win_model.fit(X, y_win)
 
 
 
 
 
 
59
 
60
+ # Train Score Prediction Model
61
+ base_score_model = HistGradientBoostingRegressor(random_state=42, learning_rate=0.1, max_iter=100)
62
+ score_model = MultiOutputRegressor(base_score_model)
63
+ score_model.fit(X, y_score)
64
 
65
+ return win_model, score_model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
+ # Train the models dynamically (without .pkl files)
68
+ player_score_model, player_scaler = train_player_score_model()
69
+ team_win_model, team_score_model = train_team_performance_model()
 
services.py CHANGED
@@ -1,232 +1,90 @@
 
1
  import pandas as pd
2
  import numpy as np
3
  from fastapi import HTTPException
4
- from models.train_model import predict_player_score, predict_team_performance
 
5
  from groq import Groq
6
 
7
- # Initialize Groq client with your API key
8
- GROQ_API_KEY = "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"
 
 
 
 
 
9
  client = Groq(api_key=GROQ_API_KEY)
10
 
11
- # Load datasets
12
  match_df = pd.read_csv('data/cleaned_match_data.csv')
13
- match_df['date'] = pd.to_datetime(match_df['date'], errors='coerce')
14
  ball_df = pd.read_csv('data/cleaned_ball_data.csv', low_memory=False)
15
 
16
- # Create a player-team mapping based on ball_df
17
- player_team_mapping = ball_df.groupby('striker')['batting_team'].agg(lambda x: x.mode()[0] if len(x.mode()) > 0 else None).to_dict()
18
-
19
- # Function to clean JSON data
20
- def clean_json(data):
21
- if isinstance(data, dict):
22
- return {k: clean_json(v) for k, v in data.items()}
23
- elif isinstance(data, list):
24
- return [clean_json(v) for v in data]
25
- elif isinstance(data, float):
26
- return 0.0 if pd.isna(data) or np.isinf(data) else data
27
- elif pd.isna(data):
28
- return None
29
- elif isinstance(data, pd.Timestamp):
30
- return data.strftime('%Y-%m-%d') if pd.notna(data) else None
31
- elif isinstance(data, (int, bool)):
32
- return data
33
- return str(data)
34
 
35
- # LLM summary generation function using Groq
36
  def generate_summary(data, context_type):
37
- prompt = ""
38
- if context_type == "player_stats":
39
- prompt = f"Summarize this player data in one sentence: {data}"
40
- elif context_type == "team_stats":
41
- prompt = f"Summarize this team data in one sentence: {data}"
42
- elif context_type == "match_history":
43
- prompt = f"Summarize this match history between {data['team1']} and {data['team2']} in one sentence: {data['matches']}"
44
- elif context_type == "prediction_score":
45
- prompt = f"Summarize this prediction in one sentence: {data}"
46
- elif context_type == "prediction_team":
47
- prompt = f"Summarize this team prediction in one sentence: {data}"
48
-
49
  try:
50
  chat_completion = client.chat.completions.create(
51
  model="mixtral-8x7b-32768",
52
- messages=[
53
- {"role": "system", "content": "You are a concise cricket analyst."},
54
- {"role": "user", "content": prompt}
55
- ],
56
  max_tokens=50,
57
  temperature=0.7
58
  )
59
- summary = chat_completion.choices[0].message.content.strip()
60
- return summary
61
  except Exception as e:
62
  return f"Summary unavailable due to error: {str(e)}"
63
 
64
- # Player statistics with name variation handling
65
- def get_player_stats(player_name: str, season: str = None, role: str = "Batting"):
66
- player_name = player_name.strip().title()
67
- name_variations = [player_name, player_name.replace(" ", ""), " ".join(reversed(player_name.split()))]
68
- player_data = ball_df[ball_df['striker'].isin(name_variations) | ball_df['bowler'].isin(name_variations)]
69
- if season and 'season' in ball_df.columns:
70
- player_data = player_data[player_data['season'] == season]
71
- if player_data.empty:
72
- raise HTTPException(status_code=404, detail=f"Player '{player_name}' not found. Variations tried: {name_variations}")
73
-
74
- if role == "Batting":
75
- batting_data = player_data[player_data['striker'].isin(name_variations)]
76
- total_runs = int(batting_data['runs_off_bat'].sum())
77
- balls_faced = int(batting_data.shape[0])
78
- strike_rate = float((total_runs / balls_faced * 100) if balls_faced > 0 else 0)
79
- matches_played = int(len(batting_data['match_id'].unique()))
80
-
81
- stats = {
82
- "player_name": player_name,
83
- "role": role,
84
- "total_runs": total_runs,
85
- "balls_faced": balls_faced,
86
- "strike_rate": strike_rate,
87
- "matches_played": matches_played,
88
- "season": season if season else "All Seasons"
89
- }
90
- stats["summary"] = generate_summary(stats, "player_stats")
91
- return clean_json(stats)
92
-
93
- elif role == "Bowling":
94
- bowling_data = player_data[player_data['bowler'].isin(name_variations)]
95
- bowler_wicket_types = ["caught", "bowled", "lbw", "caught and bowled", "hit wicket"]
96
- wickets_data = bowling_data[bowling_data['player_dismissed'].notna() &
97
- bowling_data['wicket_type'].isin(bowler_wicket_types)]
98
- total_wickets = int(wickets_data.shape[0])
99
- total_runs_conceded = int(bowling_data['total_runs'].sum())
100
- total_balls_bowled = int(bowling_data.shape[0])
101
- total_overs_bowled = float(total_balls_bowled / 6)
102
- bowling_average = float(total_runs_conceded / total_wickets) if total_wickets > 0 else float('inf')
103
- economy_rate = float(total_runs_conceded / total_overs_bowled) if total_overs_bowled > 0 else 0
104
- bowling_strike_rate = float(total_balls_bowled / total_wickets) if total_wickets > 0 else float('inf')
105
- bowling_matches = int(len(bowling_data['match_id'].unique()))
106
-
107
- stats = {
108
- "player_name": player_name,
109
- "role": role,
110
- "total_wickets": total_wickets,
111
- "bowling_average": 0.0 if np.isinf(bowling_average) else round(bowling_average, 2),
112
- "economy_rate": round(economy_rate, 2),
113
- "bowling_strike_rate": 0.0 if np.isinf(bowling_strike_rate) else round(bowling_strike_rate, 2),
114
- "overs_bowled": round(total_overs_bowled, 1),
115
- "bowling_matches": bowling_matches,
116
- "season": season if season else "All Seasons"
117
- }
118
- stats["summary"] = generate_summary(stats, "player_stats")
119
- return clean_json(stats)
120
-
121
- # Team statistics
122
- def get_team_stats(team_name: str, season: str = None):
123
- team_name = team_name.strip().title()
124
- team_matches = match_df[(match_df['team1'] == team_name) | (match_df['team2'] == team_name)]
125
- if season and 'season' in match_df.columns:
126
- team_matches = team_matches[team_matches['season'] == season]
127
- if team_matches.empty:
128
- raise HTTPException(status_code=404, detail="Team not found")
129
-
130
- wins = int(team_matches[team_matches['winner'] == team_name].shape[0])
131
- total_matches = int(team_matches.shape[0])
132
-
133
- stats = {
134
- "total_matches": total_matches,
135
- "wins": wins,
136
- "losses": total_matches - wins,
137
- "win_percentage": float((wins / total_matches * 100) if total_matches > 0 else 0),
138
- "season": season if season else "All Seasons"
139
- }
140
- stats["summary"] = generate_summary(stats, "team_stats")
141
- return clean_json(stats)
142
-
143
- # Match History Retrieval
144
- def get_match_history(team1: str, team2: str, season: str = None):
145
- team1 = team1.strip().title()
146
- team2 = team2.strip().title()
147
- available_teams = set(match_df['team1'].unique().tolist() + match_df['team2'].unique().tolist())
148
- if team1 not in available_teams or team2 not in available_teams:
149
- raise HTTPException(status_code=404, detail=f"Team {team1 if team1 not in available_teams else team2} not found.")
150
-
151
- team_matches = match_df[
152
- ((match_df['team1'] == team1) & (match_df['team2'] == team2)) |
153
- ((match_df['team1'] == team2) & (match_df['team2'] == team1))
154
- ].copy()
155
- if season and 'season' in match_df.columns:
156
- team_matches = team_matches[team_matches['season'] == season]
157
- if team_matches.empty:
158
- raise HTTPException(status_code=404, detail=f"No match history found between {team1} and {team2}.")
159
-
160
- team_matches['date'] = team_matches['date'].apply(lambda x: x.strftime('%Y-%m-%d') if pd.notna(x) else None)
161
- team_matches['winner'] = team_matches['winner'].fillna("Draw")
162
- for column in ['team1', 'team2', 'winner']:
163
- team_matches[column] = team_matches[column].apply(lambda x: str(x) if pd.notna(x) else None)
164
- history = team_matches[['date', 'team1', 'team2', 'winner']].to_dict(orient='records')
165
-
166
- response = {
167
- "team1": team1,
168
- "team2": team2,
169
- "season": season if season else "All Seasons",
170
- "matches": history
171
- }
172
- response["summary"] = generate_summary(response, "match_history")
173
- return clean_json(response)
174
-
175
- # Prediction functions
176
  def predict_score(player_name: str, opposition_team: str):
177
  try:
178
- # Handle name variations
179
- player_name = player_name.strip().replace("+", " ").title()
180
- name_variations = [player_name, player_name.replace(" ", ""), " ".join(reversed(player_name.split()))]
181
- player_team = None
182
- for name in name_variations:
183
- if name in player_team_mapping:
184
- player_team = player_team_mapping[name]
185
- player_name = name # Use the matched name
186
- break
187
- if not player_team:
188
- raise ValueError(f"Player {player_name} not found in historical data")
189
-
190
- # Debug: Print arguments before calling predict_player_score
191
- print(f"Calling predict_player_score with: player={player_name}, team={player_team}, opponent={opposition_team}")
192
 
193
- predicted_runs = predict_player_score(
194
- player=player_name,
195
- team=player_team,
196
- opponent=opposition_team,
197
- venue=None,
198
- city=None,
199
- toss_winner=None,
200
- toss_decision=None
201
- )
202
  stats = {
203
  "player": player_name,
204
- "team": player_team,
205
  "opposition": opposition_team,
206
- "predicted_runs": predicted_runs["expected_score"]
 
207
  }
208
- stats["summary"] = generate_summary(stats, "prediction_score")
209
- return clean_json(stats)
210
  except Exception as e:
211
- raise HTTPException(status_code=500, detail=f"Error predicting score for {player_name} against {opposition_team}: {str(e)}")
212
 
 
213
  def predict_team_outcome(team1: str, team2: str):
214
- prediction = predict_team_performance(team1, team2)
215
- prediction["summary"] = generate_summary(prediction, "prediction_team")
216
- return clean_json(prediction)
 
 
 
 
 
 
 
 
 
 
 
 
217
 
218
- # Utility functions
219
  def get_teams():
220
- return clean_json({"teams": sorted(set(match_df['team1'].unique().tolist() + match_df['team2'].unique().tolist()))})
221
 
222
  def get_players():
223
- unique_players = sorted(set(ball_df['striker'].dropna().unique().tolist()))
224
- return clean_json({"players": unique_players})
225
 
226
  def get_seasons():
227
- return clean_json({"seasons": ["All Seasons"] + sorted(match_df['season'].dropna().unique().tolist())})
228
 
229
- # New function for team trends over time
230
  def get_team_trends(team_name: str):
231
  team_name = team_name.strip().title()
232
  team_matches = match_df[(match_df['team1'] == team_name) | (match_df['team2'] == team_name)]
@@ -245,12 +103,12 @@ def get_team_trends(team_name: str):
245
  "season": season,
246
  "wins": wins,
247
  "total_matches": total_matches,
248
- "win_percentage": win_percentage
249
  })
250
 
251
- return {"team_name": team_name, "trends": trends}
252
 
253
- # New function for player trends over time
254
  def get_player_trends(player_name: str, role: str = "Batting"):
255
  player_name = player_name.strip().title()
256
  name_variations = [player_name, player_name.replace(" ", ""), " ".join(reversed(player_name.split()))]
@@ -271,7 +129,7 @@ def get_player_trends(player_name: str, role: str = "Batting"):
271
  trends.append({
272
  "season": season,
273
  "total_runs": total_runs,
274
- "strike_rate": strike_rate,
275
  "matches_played": matches_played
276
  })
277
  elif role == "Bowling":
@@ -284,9 +142,9 @@ def get_player_trends(player_name: str, role: str = "Batting"):
284
  trends.append({
285
  "season": season,
286
  "total_wickets": total_wickets,
287
- "bowling_average": bowling_average,
288
- "economy_rate": economy_rate,
289
  "matches_played": matches_played
290
  })
291
 
292
- return {"player_name": player_name, "role": role, "trends": trends}
 
1
+ import os
2
  import pandas as pd
3
  import numpy as np
4
  from fastapi import HTTPException
5
+ from models.train_model import train_player_score_model, train_team_performance_model # No .pkl files needed!
6
+ from dotenv import load_dotenv
7
  from groq import Groq
8
 
9
# Load variables from a .env file (if present) into the process environment.
load_dotenv()

# The Groq key is read from the environment so it is never hard-coded here.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if GROQ_API_KEY is None or GROQ_API_KEY == "":
    raise ValueError("Missing GROQ API key. Set it in environment variables.")
client = Groq(api_key=GROQ_API_KEY)

# Cleaned match and ball CSVs used by the helpers in this module.
match_df = pd.read_csv('data/cleaned_match_data.csv')
ball_df = pd.read_csv('data/cleaned_ball_data.csv', low_memory=False)

# Models are trained at import time rather than loaded from `.pkl` files.
player_score_model, player_scaler = train_player_score_model()
team_win_model, team_score_model = train_team_performance_model()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
# 🔹 LLM Summary Generation (Groq AI)
def generate_summary(data, context_type):
    """Ask the Groq chat model for a one-sentence summary of ``data``.

    ``context_type`` is interpolated into the prompt as a label for the data.
    Returns the model's reply with surrounding whitespace stripped. If calling
    the API or reading the reply raises, a "Summary unavailable ..." string
    describing the error is returned instead of propagating the exception.
    """
    prompt = f"Summarize this {context_type} data in one sentence: {data}"
    conversation = [
        {"role": "system", "content": "You are a concise cricket analyst."},
        {"role": "user", "content": prompt},
    ]
    try:
        response = client.chat.completions.create(
            model="mixtral-8x7b-32768",
            messages=conversation,
            max_tokens=50,
            temperature=0.7,
        )
        first_choice = response.choices[0]
        return first_choice.message.content.strip()
    except Exception as exc:
        # Best-effort: the summary is optional, so failures become text.
        return "Summary unavailable due to error: " + str(exc)
41
 
42
# 🔹 Predict Player Score (No `.pkl` file needed)
def predict_score(player_name: str, opposition_team: str):
    """Predict the runs ``player_name`` scores against ``opposition_team``.

    NOTE(review): the feature vector below is a fixed example, so the numeric
    prediction does not yet depend on either argument - confirm the intended
    feature construction against ``train_player_score_model``.

    Returns:
        dict with keys ``player``, ``opposition``, ``predicted_runs`` (plain
        Python float rounded to 2 places, or a list of such floats if the
        model emits several outputs per sample) and ``summary``.

    Raises:
        HTTPException: status 500 wrapping any error raised while predicting.
    """
    try:
        features = player_scaler.transform(np.array([[50, 1, 2, 3, 1]]))  # Example feature vector
        # Convert to built-in floats so the response holds no NumPy objects.
        raw = np.asarray(player_score_model.predict(features)[0], dtype=float)
        if raw.ndim == 0:
            predicted_runs = round(float(raw), 2)
        else:
            predicted_runs = [round(float(value), 2) for value in raw.ravel()]

        stats = {
            "player": player_name,
            "opposition": opposition_team,
            "predicted_runs": predicted_runs,
        }
        # Summarize the whole record (player, opposition, runs) rather than the
        # bare number, so the generated sentence has context to work with.
        stats["summary"] = generate_summary(stats, "prediction_score")
        return stats
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error predicting score: {str(e)}") from e
58
 
59
# 🔹 Predict Team Outcome (No `.pkl` file needed)
def predict_team_outcome(team1: str, team2: str):
    """Predict win probability and expected scores for ``team1`` vs ``team2``.

    NOTE(review): the feature vector below is a fixed example, so the numbers
    do not yet depend on the team names - confirm the intended encoding
    against ``train_team_performance_model``.

    Returns:
        dict with keys ``team1``, ``team2``, ``win_probability_team1``
        (percentage), ``expected_team1_score``, ``expected_team2_score`` (all
        plain Python floats rounded to 2 places) and ``summary``.

    Raises:
        HTTPException: status 500 wrapping any error raised while predicting.
    """
    try:
        input_features = np.array([[1, 2]])  # Example feature vector
        # Column 1 of predict_proba is read as the probability of a team1 win.
        win_probability = float(team_win_model.predict_proba(input_features)[:, 1][0]) * 100
        predicted_scores = team_score_model.predict(input_features)[0]

        # float() keeps NumPy scalar types out of the response payload.
        prediction = {
            "team1": team1,
            "team2": team2,
            "win_probability_team1": round(win_probability, 2),
            "expected_team1_score": round(float(predicted_scores[0]), 2),
            "expected_team2_score": round(float(predicted_scores[1]), 2),
        }
        # Summarize the full prediction (teams and scores), not just the bare
        # win probability, so the generated sentence has context.
        prediction["summary"] = generate_summary(prediction, "prediction_team")
        return prediction
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error predicting team outcome: {str(e)}") from e
76
 
77
# 🔹 Utility Functions
def get_teams():
    """Return ``{"teams": [...]}`` with every team name in ``match_df``, sorted.

    Missing values are dropped first, as ``get_players`` and ``get_seasons``
    already do: a NaN/None entry cannot be ordered against strings, so
    ``sorted`` would raise ``TypeError`` on it.
    """
    names = set()
    for column in ('team1', 'team2'):
        names.update(match_df[column].dropna().unique().tolist())
    return {"teams": sorted(names)}


def get_players():
    """Return ``{"players": [...]}`` with every non-null striker name, sorted."""
    # unique() already de-duplicates, so no extra set() pass is needed.
    return {"players": sorted(ball_df['striker'].dropna().unique().tolist())}


def get_seasons():
    """Return ``{"seasons": [...]}``: "All Seasons" followed by the sorted seasons."""
    seasons = sorted(match_df['season'].dropna().unique().tolist())
    return {"seasons": ["All Seasons"] + seasons}
86
 
87
+ # 🔹 Get Team Trends Over Time
88
  def get_team_trends(team_name: str):
89
  team_name = team_name.strip().title()
90
  team_matches = match_df[(match_df['team1'] == team_name) | (match_df['team2'] == team_name)]
 
103
  "season": season,
104
  "wins": wins,
105
  "total_matches": total_matches,
106
+ "win_percentage": round(win_percentage, 2)
107
  })
108
 
109
+ return {"team_name": team_name, "trends": trends, "summary": generate_summary(trends, "team_trends")}
110
 
111
+ # 🔹 Get Player Trends Over Time
112
  def get_player_trends(player_name: str, role: str = "Batting"):
113
  player_name = player_name.strip().title()
114
  name_variations = [player_name, player_name.replace(" ", ""), " ".join(reversed(player_name.split()))]
 
129
  trends.append({
130
  "season": season,
131
  "total_runs": total_runs,
132
+ "strike_rate": round(strike_rate, 2),
133
  "matches_played": matches_played
134
  })
135
  elif role == "Bowling":
 
142
  trends.append({
143
  "season": season,
144
  "total_wickets": total_wickets,
145
+ "bowling_average": round(bowling_average, 2),
146
+ "economy_rate": round(economy_rate, 2),
147
  "matches_played": matches_played
148
  })
149
 
150
+ return {"player_name": player_name, "role": role, "trends": trends, "summary": generate_summary(trends, "player_trends")}