sivapriya175
commited on
Commit
·
ee329f7
1
Parent(s):
8bf5b2b
deploy backend files
Browse files- .env +0 -0
- __pycache__/services.cpython-313.pyc +0 -0
- models/__pycache__/train_model.cpython-313.pyc +0 -0
- models/train_model.py +36 -262
- services.py +56 -198
.env
ADDED
File without changes
|
__pycache__/services.cpython-313.pyc
CHANGED
Binary files a/__pycache__/services.cpython-313.pyc and b/__pycache__/services.cpython-313.pyc differ
|
|
models/__pycache__/train_model.cpython-313.pyc
CHANGED
Binary files a/models/__pycache__/train_model.cpython-313.pyc and b/models/__pycache__/train_model.cpython-313.pyc differ
|
|
models/train_model.py
CHANGED
@@ -5,291 +5,65 @@ from sklearn.multioutput import MultiOutputRegressor
|
|
5 |
from sklearn.model_selection import train_test_split
|
6 |
from sklearn.metrics import mean_squared_error, accuracy_score, r2_score
|
7 |
from sklearn.preprocessing import StandardScaler
|
8 |
-
import joblib
|
9 |
|
10 |
-
# Load datasets
|
11 |
-
ball_df = pd.read_csv('data/cleaned_ball_data.csv'
|
12 |
-
|
13 |
-
'match_id': str, 'season': str, 'start_date': str, 'venue': str,
|
14 |
-
'innings': int, 'ball': float, 'batting_team': str, 'bowling_team': str,
|
15 |
-
'striker': str, 'non_striker': str, 'bowler': str, 'runs_off_bat': int,
|
16 |
-
'extras': int, 'wides': float, 'noballs': float, 'byes': float,
|
17 |
-
'legbyes': float, 'penalty': float, 'wicket_type': str,
|
18 |
-
'player_dismissed': str, 'other_wicket_type': str,
|
19 |
-
'other_player_dismissed': str, 'cricsheet_id': str, 'total_runs': int
|
20 |
-
})
|
21 |
-
match_df = pd.read_csv('data/cleaned_match_data.csv',
|
22 |
-
dtype={
|
23 |
-
'id': str, 'season': str, 'city': str, 'date': str,
|
24 |
-
'team1': str, 'team2': str, 'toss_winner': str, 'toss_decision': str,
|
25 |
-
'result': str, 'dl_applied': int, 'winner': str,
|
26 |
-
'win_by_runs': float, 'win_by_wickets': float, 'player_of_match': str,
|
27 |
-
'venue': str, 'umpire1': str, 'umpire2': str, 'umpire3': str
|
28 |
-
})
|
29 |
|
30 |
-
# Convert date columns
|
31 |
match_df['date'] = pd.to_datetime(match_df['date'], errors='coerce')
|
32 |
ball_df['start_date'] = pd.to_datetime(ball_df['start_date'], errors='coerce')
|
33 |
|
34 |
-
#
|
35 |
-
odi_date_mask = (match_df['date'].dt.year >= 2015) & (match_df['date'].dt.year <= 2022)
|
36 |
-
match_df = match_df[odi_date_mask].copy()
|
37 |
-
|
38 |
-
# Compute team total scores from ball_df
|
39 |
-
team_scores = ball_df.groupby(['match_id', 'batting_team'])['total_runs'].sum().reset_index()
|
40 |
-
team_scores.rename(columns={'total_runs': 'team_total'}, inplace=True)
|
41 |
-
|
42 |
-
# Merge computed scores into match_df with better handling for missing data
|
43 |
-
match_df = match_df.merge(team_scores, left_on=['id', 'team1'], right_on=['match_id', 'batting_team'], how='left')
|
44 |
-
match_df.rename(columns={'team_total': 'team1_total'}, inplace=True)
|
45 |
-
match_df['team1_total'] = match_df['team1_total'].fillna(match_df['team1_total'].mean())
|
46 |
-
match_df = match_df.merge(team_scores, left_on=['id', 'team2'], right_on=['match_id', 'batting_team'], how='left')
|
47 |
-
match_df.rename(columns={'team_total': 'team2_total'}, inplace=True)
|
48 |
-
match_df['team2_total'] = match_df['team2_total'].fillna(match_df['team2_total'].mean())
|
49 |
-
|
50 |
-
# Drop extra columns created during merging
|
51 |
-
match_df.drop(columns=['batting_team', 'match_id'], errors='ignore', inplace=True)
|
52 |
-
|
53 |
-
# Add venue and city indices
|
54 |
-
match_df['venue_index'] = match_df['venue'].astype('category').cat.codes
|
55 |
-
match_df['city_index'] = match_df['city'].astype('category').cat.codes
|
56 |
-
|
57 |
-
# Add toss features
|
58 |
-
match_df['toss_winner_index'] = match_df['toss_winner'].astype('category').cat.codes
|
59 |
-
match_df['toss_decision_index'] = match_df['toss_decision'].map({'bat': 1, 'field': 0}).fillna(0).astype(int)
|
60 |
-
|
61 |
-
# Compute historical win rates for each team (weighted by recency)
|
62 |
-
match_df['date_numeric'] = (match_df['date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1d')
|
63 |
-
max_date = match_df['date_numeric'].max()
|
64 |
-
team1_wins = match_df[match_df['winner'] == match_df['team1']].groupby('team1').agg({'date_numeric': 'mean', 'id': 'count'}).reset_index()
|
65 |
-
team1_wins.rename(columns={'id': 'wins', 'date_numeric': 'win_date', 'team1': 'team'}, inplace=True)
|
66 |
-
team2_wins = match_df[match_df['winner'] == match_df['team2']].groupby('team2').agg({'date_numeric': 'mean', 'id': 'count'}).reset_index()
|
67 |
-
team2_wins.rename(columns={'id': 'wins', 'date_numeric': 'win_date', 'team2': 'team'}, inplace=True)
|
68 |
-
team_wins = pd.concat([team1_wins, team2_wins]).groupby('team').agg({'wins': 'sum', 'win_date': 'mean'}).reset_index()
|
69 |
-
team1_matches = match_df.groupby('team1').size().reset_index(name='matches')
|
70 |
-
team1_matches.rename(columns={'team1': 'team'}, inplace=True)
|
71 |
-
team2_matches = match_df.groupby('team2').size().reset_index(name='matches')
|
72 |
-
team2_matches.rename(columns={'team2': 'team'}, inplace=True)
|
73 |
-
team_matches = pd.concat([team1_matches, team2_matches]).groupby('team')['matches'].sum().reset_index()
|
74 |
-
team_win_rates = team_matches.merge(team_wins, on='team', how='left').fillna(0)
|
75 |
-
team_win_rates['weighted_wins'] = team_win_rates.apply(lambda x: x['wins'] * np.exp(-0.1 * (max_date - x['win_date']) / 365) if pd.notna(x['win_date']) else 0, axis=1)
|
76 |
-
team_win_rates['win_rate'] = team_win_rates['weighted_wins'] / team_win_rates['matches']
|
77 |
-
team_win_rates['win_rate'] = team_win_rates['win_rate'].fillna(0)
|
78 |
-
match_df = match_df.merge(team_win_rates[['team', 'win_rate']].rename(columns={'team': 'team1', 'win_rate': 'team1_win_rate'}), on='team1', how='left')
|
79 |
-
match_df = match_df.merge(team_win_rates[['team', 'win_rate']].rename(columns={'team': 'team2', 'win_rate': 'team2_win_rate'}), on='team2', how='left')
|
80 |
-
|
81 |
-
# Compute head-to-head win rates with minimum match threshold
|
82 |
-
head_to_head = match_df[match_df['team1'].isin(match_df['team1'].unique()) & match_df['team2'].isin(match_df['team2'].unique())]
|
83 |
-
head_to_head_wins = head_to_head[head_to_head['winner'] == head_to_head['team1']].groupby(['team1', 'team2']).size().reset_index(name='h2h_wins')
|
84 |
-
head_to_head_matches = head_to_head.groupby(['team1', 'team2']).size().reset_index(name='h2h_matches')
|
85 |
-
h2h_win_rates = head_to_head_matches.merge(head_to_head_wins, on=['team1', 'team2'], how='left').fillna(0)
|
86 |
-
h2h_win_rates = h2h_win_rates[head_to_head_matches['h2h_matches'] >= 1]
|
87 |
-
h2h_win_rates['h2h_win_rate'] = h2h_win_rates['h2h_wins'] / h2h_win_rates['h2h_matches']
|
88 |
-
match_df = match_df.merge(h2h_win_rates[['team1', 'team2', 'h2h_win_rate']], on=['team1', 'team2'], how='left').fillna(0)
|
89 |
-
|
90 |
-
# Cap outliers in target variables
|
91 |
-
match_df['team1_total'] = match_df['team1_total'].clip(upper=500)
|
92 |
-
match_df['team2_total'] = match_df['team2_total'].clip(upper=500)
|
93 |
-
|
94 |
-
# Train Team Performance Prediction Model
|
95 |
-
def train_team_performance_model():
|
96 |
-
data = match_df[['team1', 'team2', 'winner', 'team1_total', 'team2_total', 'venue_index', 'city_index',
|
97 |
-
'toss_winner_index', 'toss_decision_index', 'dl_applied', 'team1_win_rate',
|
98 |
-
'team2_win_rate', 'h2h_win_rate']].dropna()
|
99 |
-
|
100 |
-
# Convert categorical teams to numerical indices
|
101 |
-
data['team1_index'] = data['team1'].astype('category').cat.codes
|
102 |
-
data['team2_index'] = data['team2'].astype('category').cat.codes
|
103 |
-
data['winner_index'] = (data['winner'] == data['team1']).astype(int)
|
104 |
-
|
105 |
-
# Features and targets
|
106 |
-
X = pd.DataFrame()
|
107 |
-
X['team1_index'] = data['team1_index']
|
108 |
-
X['team2_index'] = data['team2_index']
|
109 |
-
X['venue_index'] = data['venue_index']
|
110 |
-
X['city_index'] = data['city_index']
|
111 |
-
X['toss_winner_index'] = data['toss_winner_index']
|
112 |
-
X['toss_decision_index'] = data['toss_decision_index']
|
113 |
-
X['dl_applied'] = data['dl_applied']
|
114 |
-
X['team1_win_rate'] = data['team1_win_rate']
|
115 |
-
X['team2_win_rate'] = data['team2_win_rate']
|
116 |
-
X['h2h_win_rate'] = data['h2h_win_rate'] * 2 # Double weight to head-to-head
|
117 |
-
|
118 |
-
y_win = data['winner_index']
|
119 |
-
y_score = data[['team1_total', 'team2_total']]
|
120 |
-
|
121 |
-
# Scale numerical features
|
122 |
-
scaler = StandardScaler()
|
123 |
-
scaled_features = scaler.fit_transform(X[['venue_index', 'city_index', 'toss_winner_index', 'toss_decision_index',
|
124 |
-
'dl_applied', 'team1_win_rate', 'team2_win_rate', 'h2h_win_rate']])
|
125 |
-
X_scaled = pd.DataFrame(scaled_features, columns=['venue_index', 'city_index', 'toss_winner_index', 'toss_decision_index',
|
126 |
-
'dl_applied', 'team1_win_rate', 'team2_win_rate', 'h2h_win_rate'])
|
127 |
-
X_scaled['team1_index'] = X['team1_index']
|
128 |
-
X_scaled['team2_index'] = X['team2_index']
|
129 |
-
|
130 |
-
# Train/test split for win prediction
|
131 |
-
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_win, test_size=0.2, random_state=42)
|
132 |
-
|
133 |
-
# Train RandomForestClassifier with tuned hyperparameters
|
134 |
-
win_model = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, class_weight='balanced')
|
135 |
-
win_model.fit(X_train, y_train)
|
136 |
-
|
137 |
-
# Evaluate
|
138 |
-
y_pred = win_model.predict(X_test)
|
139 |
-
accuracy = accuracy_score(y_test, y_pred)
|
140 |
-
print(f"Team Win Model Accuracy: {accuracy}")
|
141 |
-
|
142 |
-
# Train HistGradientBoostingRegressor with MultiOutputRegressor for score prediction
|
143 |
-
base_score_model = HistGradientBoostingRegressor(random_state=42, learning_rate=0.1, max_iter=100)
|
144 |
-
score_model = MultiOutputRegressor(base_score_model)
|
145 |
-
score_model.fit(X_scaled, y_score)
|
146 |
-
|
147 |
-
# Evaluate score model
|
148 |
-
y_score_pred = score_model.predict(X_scaled)
|
149 |
-
mse = mean_squared_error(y_score, y_score_pred)
|
150 |
-
r2 = r2_score(y_score, y_score_pred)
|
151 |
-
print(f"Team Score Model MSE: {mse}, R²: {r2}")
|
152 |
-
|
153 |
-
# Save models and scaler
|
154 |
-
joblib.dump((win_model, score_model, data, scaler), 'models/team_performance_predictor.pkl')
|
155 |
-
|
156 |
-
# Train Player Score Prediction Model
|
157 |
def train_player_score_model():
|
158 |
-
|
159 |
-
player_runs = ball_df.groupby(['match_id', 'striker', 'batting_team'])['runs_off_bat'].sum().reset_index()
|
160 |
player_runs.rename(columns={'runs_off_bat': 'player_total'}, inplace=True)
|
161 |
-
|
162 |
-
# Merge with match_df to get match context
|
163 |
player_data = player_runs.merge(match_df, left_on='match_id', right_on='id', how='left')
|
164 |
|
165 |
-
# Feature
|
166 |
-
player_data['player_avg'] = player_data.groupby('striker')['player_total'].transform('mean')
|
167 |
-
player_data['team_win_rate'] = player_data.apply(lambda x: player_data[player_data['team1'] == x['batting_team']]['team1_win_rate'].mean()
|
168 |
-
if x['batting_team'] == x['team1'] else player_data[player_data['team2'] == x['batting_team']]['team2_win_rate'].mean(), axis=1)
|
169 |
player_data['venue_index'] = player_data['venue'].astype('category').cat.codes
|
170 |
player_data['city_index'] = player_data['city'].astype('category').cat.codes
|
171 |
player_data['toss_winner_index'] = player_data['toss_winner'].astype('category').cat.codes
|
172 |
player_data['toss_decision_index'] = player_data['toss_decision'].map({'bat': 1, 'field': 0}).fillna(0).astype(int)
|
173 |
|
174 |
-
# Features and
|
175 |
-
X = player_data[['player_avg', '
|
176 |
y = player_data.loc[X.index, 'player_total']
|
177 |
|
178 |
-
# Scale
|
179 |
scaler = StandardScaler()
|
180 |
X_scaled = scaler.fit_transform(X)
|
181 |
|
182 |
-
# Train
|
183 |
-
|
184 |
-
|
185 |
-
# Train HistGradientBoostingRegressor
|
186 |
-
score_model = HistGradientBoostingRegressor(random_state=42, learning_rate=0.1, max_iter=100)
|
187 |
-
score_model.fit(X_train, y_train)
|
188 |
-
|
189 |
-
# Evaluate
|
190 |
-
y_pred = score_model.predict(X_test)
|
191 |
-
mse = mean_squared_error(y_test, y_pred)
|
192 |
-
r2 = r2_score(y_test, y_pred)
|
193 |
-
print(f"Player Score Model MSE: {mse}, R²: {r2}")
|
194 |
-
|
195 |
-
# Save model and scaler
|
196 |
-
joblib.dump((score_model, scaler, player_data), 'models/player_score_predictor.pkl')
|
197 |
-
|
198 |
-
# Predict Player Score
|
199 |
-
def predict_player_score(player: str, team: str, opponent: str, venue: str = None, city: str = None,
|
200 |
-
toss_winner: str = None, toss_decision: str = None):
|
201 |
-
try:
|
202 |
-
score_model, scaler, player_data = joblib.load('models/player_score_predictor.pkl')
|
203 |
-
|
204 |
-
if player not in player_data['striker'].values or team not in player_data['batting_team'].values:
|
205 |
-
raise ValueError("Player or team not found in training data")
|
206 |
-
|
207 |
-
# Compute player average from historical data
|
208 |
-
player_avg = player_data[player_data['striker'] == player]['player_total'].mean()
|
209 |
-
team_win_rate = player_data[player_data['batting_team'] == team]['team_win_rate'].mean()
|
210 |
-
|
211 |
-
# Use specific values if provided, otherwise default to mean
|
212 |
-
venue_index = player_data[player_data['venue'] == venue]['venue_index'].values[0] if venue else player_data['venue_index'].mean()
|
213 |
-
city_index = player_data[player_data['city'] == city]['city_index'].values[0] if city else player_data['city_index'].mean()
|
214 |
-
toss_winner_index = player_data[player_data['toss_winner'] == toss_winner]['toss_winner_index'].values[0] if toss_winner else player_data['toss_winner_index'].mean()
|
215 |
-
toss_decision_index = 1 if toss_decision == 'bat' else 0 if toss_decision == 'field' else player_data['toss_decision_index'].mean()
|
216 |
-
|
217 |
-
# Scale features
|
218 |
-
features = scaler.transform([[player_avg, team_win_rate, venue_index, city_index, toss_winner_index, toss_decision_index]])
|
219 |
-
predicted_score = score_model.predict(features)[0]
|
220 |
|
221 |
-
|
222 |
-
"player": player,
|
223 |
-
"team": team,
|
224 |
-
"opponent": opponent,
|
225 |
-
"expected_score": round(predicted_score, 2)
|
226 |
-
}
|
227 |
-
except Exception as e:
|
228 |
-
print(f"Prediction error: {str(e)}")
|
229 |
-
return {
|
230 |
-
"player": player,
|
231 |
-
"team": team,
|
232 |
-
"opponent": opponent,
|
233 |
-
"expected_score": 0.0
|
234 |
-
}
|
235 |
|
236 |
-
#
|
237 |
-
def
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
if team1 not in data['team1'].values or team2 not in data['team2'].values:
|
243 |
-
raise ValueError("Team not found in training data")
|
244 |
-
|
245 |
-
# Get team indices
|
246 |
-
team1_index = data[data['team1'] == team1]['team1_index'].values[0]
|
247 |
-
team2_index = data[data['team2'] == team2]['team2_index'].values[0]
|
248 |
-
|
249 |
-
# Use specific values if provided, otherwise default to mean
|
250 |
-
venue_index = data[data['venue'] == venue]['venue_index'].values[0] if venue else data['venue_index'].mean()
|
251 |
-
city_index = data[data['city'] == city]['city_index'].values[0] if city else data['city_index'].mean()
|
252 |
-
toss_winner_index = data[data['toss_winner'] == toss_winner]['toss_winner_index'].values[0] if toss_winner else data['toss_winner_index'].mean()
|
253 |
-
toss_decision_index = 1 if toss_decision == 'bat' else 0 if toss_decision == 'field' else data['toss_decision_index'].mean()
|
254 |
-
dl_applied = 0 if pd.isna(toss_decision) else data['dl_applied'].mean()
|
255 |
-
team1_win_rate = data[data['team1'] == team1]['team1_win_rate'].values[0]
|
256 |
-
team2_win_rate = data[data['team2'] == team2]['team2_win_rate'].values[0]
|
257 |
-
h2h_win_rate = data[(data['team1'] == team1) & (data['team2'] == team2)]['h2h_win_rate'].values[0] if not data[(data['team1'] == team1) & (data['team2'] == team2)].empty else 0
|
258 |
|
259 |
-
|
260 |
-
|
|
|
|
|
261 |
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
win_probability = win_model.predict_proba([[team1_index, team2_index, features[0][0], features[0][1],
|
266 |
-
features[0][2], features[0][3], features[0][4], features[0][5],
|
267 |
-
features[0][6], features[0][7]]])[:, 1][0] * 100
|
268 |
-
predicted_scores = score_model.predict([[team1_index, team2_index, features[0][0], features[0][1],
|
269 |
-
features[0][2], features[0][3], features[0][4], features[0][5],
|
270 |
-
features[0][6], features[0][7]]])[0]
|
271 |
|
272 |
-
|
273 |
-
|
|
|
|
|
274 |
|
275 |
-
|
276 |
-
"team1": team1,
|
277 |
-
"team2": team2,
|
278 |
-
"win_probability_team1": round(win_probability, 2),
|
279 |
-
"expected_team1_score": round(predicted_scores[0], 2),
|
280 |
-
"expected_team2_score": round(predicted_scores[1], 2)
|
281 |
-
}
|
282 |
-
except Exception as e:
|
283 |
-
print(f"Prediction error: {str(e)}")
|
284 |
-
return {
|
285 |
-
"team1": team1,
|
286 |
-
"team2": team2,
|
287 |
-
"win_probability_team1": 50.0,
|
288 |
-
"expected_team1_score": 0.0,
|
289 |
-
"expected_team2_score": 0.0
|
290 |
-
}
|
291 |
|
292 |
-
# Train the models
|
293 |
-
|
294 |
-
|
295 |
-
train_player_score_model()
|
|
|
5 |
from sklearn.model_selection import train_test_split
|
6 |
from sklearn.metrics import mean_squared_error, accuracy_score, r2_score
|
7 |
from sklearn.preprocessing import StandardScaler
|
|
|
8 |
|
9 |
+
# Load datasets
|
10 |
+
ball_df = pd.read_csv('data/cleaned_ball_data.csv')
|
11 |
+
match_df = pd.read_csv('data/cleaned_match_data.csv')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
+
# Convert date columns
|
14 |
match_df['date'] = pd.to_datetime(match_df['date'], errors='coerce')
|
15 |
ball_df['start_date'] = pd.to_datetime(ball_df['start_date'], errors='coerce')
|
16 |
|
17 |
+
# Train Player Score Model (Without Saving .pkl)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
def train_player_score_model():
|
19 |
+
player_runs = ball_df.groupby(['match_id', 'striker'])['runs_off_bat'].sum().reset_index()
|
|
|
20 |
player_runs.rename(columns={'runs_off_bat': 'player_total'}, inplace=True)
|
|
|
|
|
21 |
player_data = player_runs.merge(match_df, left_on='match_id', right_on='id', how='left')
|
22 |
|
23 |
+
# Feature Engineering
|
24 |
+
player_data['player_avg'] = player_data.groupby('striker')['player_total'].transform('mean')
|
|
|
|
|
25 |
player_data['venue_index'] = player_data['venue'].astype('category').cat.codes
|
26 |
player_data['city_index'] = player_data['city'].astype('category').cat.codes
|
27 |
player_data['toss_winner_index'] = player_data['toss_winner'].astype('category').cat.codes
|
28 |
player_data['toss_decision_index'] = player_data['toss_decision'].map({'bat': 1, 'field': 0}).fillna(0).astype(int)
|
29 |
|
30 |
+
# Features and Target
|
31 |
+
X = player_data[['player_avg', 'venue_index', 'city_index', 'toss_winner_index', 'toss_decision_index']].dropna()
|
32 |
y = player_data.loc[X.index, 'player_total']
|
33 |
|
34 |
+
# Scale features
|
35 |
scaler = StandardScaler()
|
36 |
X_scaled = scaler.fit_transform(X)
|
37 |
|
38 |
+
# Train Model
|
39 |
+
model = HistGradientBoostingRegressor(random_state=42, learning_rate=0.1, max_iter=100)
|
40 |
+
model.fit(X_scaled, y)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
+
return model, scaler
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
+
# Train Team Performance Model (Without Saving .pkl)
|
45 |
+
def train_team_performance_model():
|
46 |
+
data = match_df[['team1', 'team2', 'winner', 'team1_total', 'team2_total', 'venue', 'city', 'toss_winner', 'toss_decision']].dropna()
|
47 |
+
data['team1_index'] = data['team1'].astype('category').cat.codes
|
48 |
+
data['team2_index'] = data['team2'].astype('category').cat.codes
|
49 |
+
data['winner_index'] = (data['winner'] == data['team1']).astype(int)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
+
# Features and targets
|
52 |
+
X = data[['team1_index', 'team2_index']]
|
53 |
+
y_win = data['winner_index']
|
54 |
+
y_score = data[['team1_total', 'team2_total']]
|
55 |
|
56 |
+
# Train Team Win Prediction Model
|
57 |
+
win_model = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42)
|
58 |
+
win_model.fit(X, y_win)
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
|
60 |
+
# Train Score Prediction Model
|
61 |
+
base_score_model = HistGradientBoostingRegressor(random_state=42, learning_rate=0.1, max_iter=100)
|
62 |
+
score_model = MultiOutputRegressor(base_score_model)
|
63 |
+
score_model.fit(X, y_score)
|
64 |
|
65 |
+
return win_model, score_model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
|
67 |
+
# Train the models dynamically (without .pkl files)
|
68 |
+
player_score_model, player_scaler = train_player_score_model()
|
69 |
+
team_win_model, team_score_model = train_team_performance_model()
|
|
services.py
CHANGED
@@ -1,232 +1,90 @@
|
|
|
|
1 |
import pandas as pd
|
2 |
import numpy as np
|
3 |
from fastapi import HTTPException
|
4 |
-
from models.train_model import
|
|
|
5 |
from groq import Groq
|
6 |
|
7 |
-
#
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
9 |
client = Groq(api_key=GROQ_API_KEY)
|
10 |
|
11 |
-
# Load datasets
|
12 |
match_df = pd.read_csv('data/cleaned_match_data.csv')
|
13 |
-
match_df['date'] = pd.to_datetime(match_df['date'], errors='coerce')
|
14 |
ball_df = pd.read_csv('data/cleaned_ball_data.csv', low_memory=False)
|
15 |
|
16 |
-
#
|
17 |
-
|
18 |
-
|
19 |
-
# Function to clean JSON data
|
20 |
-
def clean_json(data):
|
21 |
-
if isinstance(data, dict):
|
22 |
-
return {k: clean_json(v) for k, v in data.items()}
|
23 |
-
elif isinstance(data, list):
|
24 |
-
return [clean_json(v) for v in data]
|
25 |
-
elif isinstance(data, float):
|
26 |
-
return 0.0 if pd.isna(data) or np.isinf(data) else data
|
27 |
-
elif pd.isna(data):
|
28 |
-
return None
|
29 |
-
elif isinstance(data, pd.Timestamp):
|
30 |
-
return data.strftime('%Y-%m-%d') if pd.notna(data) else None
|
31 |
-
elif isinstance(data, (int, bool)):
|
32 |
-
return data
|
33 |
-
return str(data)
|
34 |
|
35 |
-
# LLM
|
36 |
def generate_summary(data, context_type):
|
37 |
-
prompt = ""
|
38 |
-
|
39 |
-
prompt = f"Summarize this player data in one sentence: {data}"
|
40 |
-
elif context_type == "team_stats":
|
41 |
-
prompt = f"Summarize this team data in one sentence: {data}"
|
42 |
-
elif context_type == "match_history":
|
43 |
-
prompt = f"Summarize this match history between {data['team1']} and {data['team2']} in one sentence: {data['matches']}"
|
44 |
-
elif context_type == "prediction_score":
|
45 |
-
prompt = f"Summarize this prediction in one sentence: {data}"
|
46 |
-
elif context_type == "prediction_team":
|
47 |
-
prompt = f"Summarize this team prediction in one sentence: {data}"
|
48 |
-
|
49 |
try:
|
50 |
chat_completion = client.chat.completions.create(
|
51 |
model="mixtral-8x7b-32768",
|
52 |
-
messages=[
|
53 |
-
|
54 |
-
{"role": "user", "content": prompt}
|
55 |
-
],
|
56 |
max_tokens=50,
|
57 |
temperature=0.7
|
58 |
)
|
59 |
-
|
60 |
-
return summary
|
61 |
except Exception as e:
|
62 |
return f"Summary unavailable due to error: {str(e)}"
|
63 |
|
64 |
-
# Player
|
65 |
-
def get_player_stats(player_name: str, season: str = None, role: str = "Batting"):
|
66 |
-
player_name = player_name.strip().title()
|
67 |
-
name_variations = [player_name, player_name.replace(" ", ""), " ".join(reversed(player_name.split()))]
|
68 |
-
player_data = ball_df[ball_df['striker'].isin(name_variations) | ball_df['bowler'].isin(name_variations)]
|
69 |
-
if season and 'season' in ball_df.columns:
|
70 |
-
player_data = player_data[player_data['season'] == season]
|
71 |
-
if player_data.empty:
|
72 |
-
raise HTTPException(status_code=404, detail=f"Player '{player_name}' not found. Variations tried: {name_variations}")
|
73 |
-
|
74 |
-
if role == "Batting":
|
75 |
-
batting_data = player_data[player_data['striker'].isin(name_variations)]
|
76 |
-
total_runs = int(batting_data['runs_off_bat'].sum())
|
77 |
-
balls_faced = int(batting_data.shape[0])
|
78 |
-
strike_rate = float((total_runs / balls_faced * 100) if balls_faced > 0 else 0)
|
79 |
-
matches_played = int(len(batting_data['match_id'].unique()))
|
80 |
-
|
81 |
-
stats = {
|
82 |
-
"player_name": player_name,
|
83 |
-
"role": role,
|
84 |
-
"total_runs": total_runs,
|
85 |
-
"balls_faced": balls_faced,
|
86 |
-
"strike_rate": strike_rate,
|
87 |
-
"matches_played": matches_played,
|
88 |
-
"season": season if season else "All Seasons"
|
89 |
-
}
|
90 |
-
stats["summary"] = generate_summary(stats, "player_stats")
|
91 |
-
return clean_json(stats)
|
92 |
-
|
93 |
-
elif role == "Bowling":
|
94 |
-
bowling_data = player_data[player_data['bowler'].isin(name_variations)]
|
95 |
-
bowler_wicket_types = ["caught", "bowled", "lbw", "caught and bowled", "hit wicket"]
|
96 |
-
wickets_data = bowling_data[bowling_data['player_dismissed'].notna() &
|
97 |
-
bowling_data['wicket_type'].isin(bowler_wicket_types)]
|
98 |
-
total_wickets = int(wickets_data.shape[0])
|
99 |
-
total_runs_conceded = int(bowling_data['total_runs'].sum())
|
100 |
-
total_balls_bowled = int(bowling_data.shape[0])
|
101 |
-
total_overs_bowled = float(total_balls_bowled / 6)
|
102 |
-
bowling_average = float(total_runs_conceded / total_wickets) if total_wickets > 0 else float('inf')
|
103 |
-
economy_rate = float(total_runs_conceded / total_overs_bowled) if total_overs_bowled > 0 else 0
|
104 |
-
bowling_strike_rate = float(total_balls_bowled / total_wickets) if total_wickets > 0 else float('inf')
|
105 |
-
bowling_matches = int(len(bowling_data['match_id'].unique()))
|
106 |
-
|
107 |
-
stats = {
|
108 |
-
"player_name": player_name,
|
109 |
-
"role": role,
|
110 |
-
"total_wickets": total_wickets,
|
111 |
-
"bowling_average": 0.0 if np.isinf(bowling_average) else round(bowling_average, 2),
|
112 |
-
"economy_rate": round(economy_rate, 2),
|
113 |
-
"bowling_strike_rate": 0.0 if np.isinf(bowling_strike_rate) else round(bowling_strike_rate, 2),
|
114 |
-
"overs_bowled": round(total_overs_bowled, 1),
|
115 |
-
"bowling_matches": bowling_matches,
|
116 |
-
"season": season if season else "All Seasons"
|
117 |
-
}
|
118 |
-
stats["summary"] = generate_summary(stats, "player_stats")
|
119 |
-
return clean_json(stats)
|
120 |
-
|
121 |
-
# Team statistics
|
122 |
-
def get_team_stats(team_name: str, season: str = None):
|
123 |
-
team_name = team_name.strip().title()
|
124 |
-
team_matches = match_df[(match_df['team1'] == team_name) | (match_df['team2'] == team_name)]
|
125 |
-
if season and 'season' in match_df.columns:
|
126 |
-
team_matches = team_matches[team_matches['season'] == season]
|
127 |
-
if team_matches.empty:
|
128 |
-
raise HTTPException(status_code=404, detail="Team not found")
|
129 |
-
|
130 |
-
wins = int(team_matches[team_matches['winner'] == team_name].shape[0])
|
131 |
-
total_matches = int(team_matches.shape[0])
|
132 |
-
|
133 |
-
stats = {
|
134 |
-
"total_matches": total_matches,
|
135 |
-
"wins": wins,
|
136 |
-
"losses": total_matches - wins,
|
137 |
-
"win_percentage": float((wins / total_matches * 100) if total_matches > 0 else 0),
|
138 |
-
"season": season if season else "All Seasons"
|
139 |
-
}
|
140 |
-
stats["summary"] = generate_summary(stats, "team_stats")
|
141 |
-
return clean_json(stats)
|
142 |
-
|
143 |
-
# Match History Retrieval
|
144 |
-
def get_match_history(team1: str, team2: str, season: str = None):
|
145 |
-
team1 = team1.strip().title()
|
146 |
-
team2 = team2.strip().title()
|
147 |
-
available_teams = set(match_df['team1'].unique().tolist() + match_df['team2'].unique().tolist())
|
148 |
-
if team1 not in available_teams or team2 not in available_teams:
|
149 |
-
raise HTTPException(status_code=404, detail=f"Team {team1 if team1 not in available_teams else team2} not found.")
|
150 |
-
|
151 |
-
team_matches = match_df[
|
152 |
-
((match_df['team1'] == team1) & (match_df['team2'] == team2)) |
|
153 |
-
((match_df['team1'] == team2) & (match_df['team2'] == team1))
|
154 |
-
].copy()
|
155 |
-
if season and 'season' in match_df.columns:
|
156 |
-
team_matches = team_matches[team_matches['season'] == season]
|
157 |
-
if team_matches.empty:
|
158 |
-
raise HTTPException(status_code=404, detail=f"No match history found between {team1} and {team2}.")
|
159 |
-
|
160 |
-
team_matches['date'] = team_matches['date'].apply(lambda x: x.strftime('%Y-%m-%d') if pd.notna(x) else None)
|
161 |
-
team_matches['winner'] = team_matches['winner'].fillna("Draw")
|
162 |
-
for column in ['team1', 'team2', 'winner']:
|
163 |
-
team_matches[column] = team_matches[column].apply(lambda x: str(x) if pd.notna(x) else None)
|
164 |
-
history = team_matches[['date', 'team1', 'team2', 'winner']].to_dict(orient='records')
|
165 |
-
|
166 |
-
response = {
|
167 |
-
"team1": team1,
|
168 |
-
"team2": team2,
|
169 |
-
"season": season if season else "All Seasons",
|
170 |
-
"matches": history
|
171 |
-
}
|
172 |
-
response["summary"] = generate_summary(response, "match_history")
|
173 |
-
return clean_json(response)
|
174 |
-
|
175 |
-
# Prediction functions
|
176 |
def predict_score(player_name: str, opposition_team: str):
|
177 |
try:
|
178 |
-
#
|
179 |
-
|
180 |
-
|
181 |
-
player_team = None
|
182 |
-
for name in name_variations:
|
183 |
-
if name in player_team_mapping:
|
184 |
-
player_team = player_team_mapping[name]
|
185 |
-
player_name = name # Use the matched name
|
186 |
-
break
|
187 |
-
if not player_team:
|
188 |
-
raise ValueError(f"Player {player_name} not found in historical data")
|
189 |
-
|
190 |
-
# Debug: Print arguments before calling predict_player_score
|
191 |
-
print(f"Calling predict_player_score with: player={player_name}, team={player_team}, opponent={opposition_team}")
|
192 |
|
193 |
-
predicted_runs = predict_player_score(
|
194 |
-
player=player_name,
|
195 |
-
team=player_team,
|
196 |
-
opponent=opposition_team,
|
197 |
-
venue=None,
|
198 |
-
city=None,
|
199 |
-
toss_winner=None,
|
200 |
-
toss_decision=None
|
201 |
-
)
|
202 |
stats = {
|
203 |
"player": player_name,
|
204 |
-
"team": player_team,
|
205 |
"opposition": opposition_team,
|
206 |
-
"predicted_runs": predicted_runs
|
|
|
207 |
}
|
208 |
-
|
209 |
-
return clean_json(stats)
|
210 |
except Exception as e:
|
211 |
-
raise HTTPException(status_code=500, detail=f"Error predicting score
|
212 |
|
|
|
213 |
def predict_team_outcome(team1: str, team2: str):
|
214 |
-
|
215 |
-
|
216 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
217 |
|
218 |
-
# Utility
|
219 |
def get_teams():
|
220 |
-
return
|
221 |
|
222 |
def get_players():
|
223 |
-
|
224 |
-
return clean_json({"players": unique_players})
|
225 |
|
226 |
def get_seasons():
|
227 |
-
return
|
228 |
|
229 |
-
#
|
230 |
def get_team_trends(team_name: str):
|
231 |
team_name = team_name.strip().title()
|
232 |
team_matches = match_df[(match_df['team1'] == team_name) | (match_df['team2'] == team_name)]
|
@@ -245,12 +103,12 @@ def get_team_trends(team_name: str):
|
|
245 |
"season": season,
|
246 |
"wins": wins,
|
247 |
"total_matches": total_matches,
|
248 |
-
"win_percentage": win_percentage
|
249 |
})
|
250 |
|
251 |
-
return {"team_name": team_name, "trends": trends}
|
252 |
|
253 |
-
#
|
254 |
def get_player_trends(player_name: str, role: str = "Batting"):
|
255 |
player_name = player_name.strip().title()
|
256 |
name_variations = [player_name, player_name.replace(" ", ""), " ".join(reversed(player_name.split()))]
|
@@ -271,7 +129,7 @@ def get_player_trends(player_name: str, role: str = "Batting"):
|
|
271 |
trends.append({
|
272 |
"season": season,
|
273 |
"total_runs": total_runs,
|
274 |
-
"strike_rate": strike_rate,
|
275 |
"matches_played": matches_played
|
276 |
})
|
277 |
elif role == "Bowling":
|
@@ -284,9 +142,9 @@ def get_player_trends(player_name: str, role: str = "Batting"):
|
|
284 |
trends.append({
|
285 |
"season": season,
|
286 |
"total_wickets": total_wickets,
|
287 |
-
"bowling_average": bowling_average,
|
288 |
-
"economy_rate": economy_rate,
|
289 |
"matches_played": matches_played
|
290 |
})
|
291 |
|
292 |
-
return {"player_name": player_name, "role": role, "trends": trends}
|
|
|
1 |
+
import os
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
from fastapi import HTTPException
|
5 |
+
from models.train_model import train_player_score_model, train_team_performance_model # No .pkl files needed!
|
6 |
+
from dotenv import load_dotenv
|
7 |
from groq import Groq
|
8 |
|
9 |
+
# Module-level setup. Everything below runs once, at import time.

# Load variables from a local .env file (if present) into the environment so
# the API key never has to be hard-coded in this file.
load_dotenv()

GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    # Fail fast: without a key the Groq client below cannot be created.
    raise ValueError("Missing GROQ API key. Set it in environment variables.")
client = Groq(api_key=GROQ_API_KEY)

# Load datasets used by the lookup and trend helpers in this module.
match_df = pd.read_csv('data/cleaned_match_data.csv')
ball_df = pd.read_csv('data/cleaned_ball_data.csv', low_memory=False)

# Train the models on import instead of loading serialized `.pkl` files.
# NOTE(review): this makes importing the module as slow as training —
# confirm that is acceptable for the deployment target.
player_score_model, player_scaler = train_player_score_model()
team_win_model, team_score_model = train_team_performance_model()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
+
# 🔹 LLM Summary Generation (Groq AI)
|
27 |
def generate_summary(data, context_type, model="mixtral-8x7b-32768"):
    """Ask the Groq chat API for a one-sentence summary of ``data``.

    This helper is best-effort: any failure while calling the API or reading
    its response is reported inside the returned string instead of being
    raised, so callers can always embed the result in their payload.

    Args:
        data: Value to summarize; its ``str()`` form is interpolated into
            the prompt.
        context_type: Short label describing the data (for example
            ``"team_trends"``); also interpolated into the prompt.
        model: Groq model identifier. Defaults to the model this function
            has always requested.
            NOTE(review): confirm this model is still served by Groq.

    Returns:
        The stripped summary text, or a string starting with
        ``"Summary unavailable due to error:"`` when no summary could be
        obtained.
    """
    prompt = f"Summarize this {context_type} data in one sentence: {data}"
    messages = [
        {"role": "system", "content": "You are a concise cricket analyst."},
        {"role": "user", "content": prompt},
    ]
    try:
        chat_completion = client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=50,
            temperature=0.7,
        )
        content = chat_completion.choices[0].message.content
        if not content:
            # Report a missing/empty completion explicitly rather than via
            # the AttributeError that ``None.strip()`` would produce.
            return "Summary unavailable due to error: model returned no text"
        return content.strip()
    except Exception as e:
        # Deliberate catch-all: a failed summary must not break the caller.
        return f"Summary unavailable due to error: {str(e)}"
|
41 |
|
42 |
+
# 🔹 Predict Player Score (No `.pkl` file needed)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
def predict_score(player_name: str, opposition_team: str):
    """Predict a player's runs against an opposition and summarize the result.

    Scales a feature vector with ``player_scaler``, feeds it to
    ``player_score_model`` and wraps the rounded prediction, together with an
    LLM-generated summary, in a plain dict.

    Args:
        player_name: Name echoed back under the ``"player"`` key.
        opposition_team: Name echoed back under the ``"opposition"`` key.

    Returns:
        A dict with the keys ``"player"``, ``"opposition"``,
        ``"predicted_runs"`` (built-in float rounded to 2 decimals) and
        ``"summary"``.

    Raises:
        HTTPException: With status 500 when any step of the prediction fails.
    """
    try:
        # NOTE(review): placeholder feature vector. It does not depend on
        # player_name or opposition_team, so every request yields the same
        # prediction. Build the real features once the training schema in
        # models/train_model.py is confirmed.
        input_features = np.array([[50, 1, 2, 3, 1]])
        input_features = player_scaler.transform(input_features)
        prediction = player_score_model.predict(input_features)

        # Coerce to a built-in float: estimators return NumPy scalars/arrays,
        # which are not guaranteed to round or JSON-serialize cleanly.
        predicted_runs = round(float(np.ravel(prediction)[0]), 2)

        stats = {
            "player": player_name,
            "opposition": opposition_team,
            "predicted_runs": predicted_runs,
        }
        # Give the summarizer the player/opposition context, not just a bare
        # number, so the sentence it writes can refer to them.
        stats["summary"] = generate_summary(dict(stats), "prediction_score")
        return stats
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error predicting score: {str(e)}") from e
|
58 |
|
59 |
+
# 🔹 Predict Team Outcome (No `.pkl` file needed)
|
60 |
def predict_team_outcome(team1: str, team2: str):
    """Predict the win probability and expected scores for a fixture.

    Uses ``team_win_model.predict_proba`` for the probability of the positive
    class and ``team_score_model.predict`` for a pair of expected scores, then
    adds an LLM-generated summary.

    Args:
        team1: Name echoed back under the ``"team1"`` key.
        team2: Name echoed back under the ``"team2"`` key.

    Returns:
        A dict with the keys ``"team1"``, ``"team2"``,
        ``"win_probability_team1"`` (percentage), ``"expected_team1_score"``,
        ``"expected_team2_score"`` (all built-in floats rounded to 2 decimals)
        and ``"summary"``.

    Raises:
        HTTPException: With status 500 when any step of the prediction fails.
    """
    try:
        # NOTE(review): placeholder feature vector. It does not depend on
        # team1 or team2, so every fixture yields the same prediction. Build
        # the real features once the training schema is confirmed.
        input_features = np.array([[1, 2]])

        # Column 1 of predict_proba is taken as team1's win probability, as
        # the original code did; confirm against the training labels.
        win_probability = float(team_win_model.predict_proba(input_features)[0][1]) * 100
        predicted_scores = team_score_model.predict(input_features)[0]

        # Built-in floats keep the payload JSON-serializable regardless of
        # the NumPy dtype the estimators return.
        outcome = {
            "team1": team1,
            "team2": team2,
            "win_probability_team1": round(win_probability, 2),
            "expected_team1_score": round(float(predicted_scores[0]), 2),
            "expected_team2_score": round(float(predicted_scores[1]), 2),
        }
        # Pass the full outcome so the summary can name the teams instead of
        # describing an unlabeled number.
        outcome["summary"] = generate_summary(dict(outcome), "prediction_team")
        return outcome
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error predicting team outcome: {str(e)}") from e
|
76 |
|
77 |
+
# 🔹 Utility Functions
|
78 |
def get_teams():
    """Return every team name that appears in the match data, sorted.

    Names are collected from both the ``team1`` and ``team2`` columns of
    ``match_df``. Missing values are dropped first: a NaN mixed in with
    strings would make ``sorted`` raise ``TypeError``.

    Returns:
        ``{"teams": [...]}`` with unique team names in ascending order.
    """
    teams = set(match_df['team1'].dropna()) | set(match_df['team2'].dropna())
    return {"teams": sorted(teams)}
|
80 |
|
81 |
def get_players():
    """Return the sorted, de-duplicated striker names from the ball data.

    Only the ``striker`` column of ``ball_df`` is consulted.
    NOTE(review): players who appear only as bowlers or non-strikers are not
    listed — confirm that is intended.

    Returns:
        ``{"players": [...]}`` with unique names in ascending order.
    """
    # ``unique()`` already removes duplicates, so no extra ``set()`` is needed.
    strikers = ball_df['striker'].dropna().unique().tolist()
    return {"players": sorted(strikers)}
|
|
|
83 |
|
84 |
def get_seasons():
    """Return the selectable seasons, led by the ``"All Seasons"`` entry.

    Seasons come from the ``season`` column of ``match_df`` with missing
    values dropped. They are ordered by their string form so a column that
    mixes integers with labels such as ``"2007/08"`` does not make ``sorted``
    raise ``TypeError``.

    Returns:
        ``{"seasons": ["All Seasons", ...]}`` with the unique seasons after
        the leading entry.
    """
    seasons = match_df['season'].dropna().unique().tolist()
    return {"seasons": ["All Seasons"] + sorted(seasons, key=str)}
|
86 |
|
87 |
+
# 🔹 Get Team Trends Over Time
|
88 |
def get_team_trends(team_name: str):
|
89 |
team_name = team_name.strip().title()
|
90 |
team_matches = match_df[(match_df['team1'] == team_name) | (match_df['team2'] == team_name)]
|
|
|
103 |
"season": season,
|
104 |
"wins": wins,
|
105 |
"total_matches": total_matches,
|
106 |
+
"win_percentage": round(win_percentage, 2)
|
107 |
})
|
108 |
|
109 |
+
return {"team_name": team_name, "trends": trends, "summary": generate_summary(trends, "team_trends")}
|
110 |
|
111 |
+
# 🔹 Get Player Trends Over Time
|
112 |
def get_player_trends(player_name: str, role: str = "Batting"):
|
113 |
player_name = player_name.strip().title()
|
114 |
name_variations = [player_name, player_name.replace(" ", ""), " ".join(reversed(player_name.split()))]
|
|
|
129 |
trends.append({
|
130 |
"season": season,
|
131 |
"total_runs": total_runs,
|
132 |
+
"strike_rate": round(strike_rate, 2),
|
133 |
"matches_played": matches_played
|
134 |
})
|
135 |
elif role == "Bowling":
|
|
|
142 |
trends.append({
|
143 |
"season": season,
|
144 |
"total_wickets": total_wickets,
|
145 |
+
"bowling_average": round(bowling_average, 2),
|
146 |
+
"economy_rate": round(economy_rate, 2),
|
147 |
"matches_played": matches_played
|
148 |
})
|
149 |
|
150 |
+
return {"player_name": player_name, "role": role, "trends": trends, "summary": generate_summary(trends, "player_trends")}
|