sivapriya175 committed
Commit 36da710 (0 parents)

deploy backend files

.gitattributes ADDED
@@ -0,0 +1,2 @@
+ *.csv filter=lfs diff=lfs merge=lfs -text
+ data/*.csv filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,20 @@
+ # Use an official Python runtime as a parent image
+ FROM python:3.9-slim
+
+ # Set the working directory in the container
+ WORKDIR /app
+
+ # Copy the requirements file first (to leverage Docker caching)
+ COPY requirements.txt .
+
+ # Install dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy the rest of the application code
+ COPY . .
+
+ # Expose the port FastAPI runs on
+ EXPOSE 7860
+
+ # Command to run FastAPI using Uvicorn
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
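
Note: for quick local testing outside the container, the same app can be served through Uvicorn's Python API. A minimal sketch, assuming main.py sits in the project root and the packages from requirements.txt are installed; run_local.py is a hypothetical helper, not part of this commit, and the port simply mirrors the EXPOSE value above.

    # run_local.py - hypothetical helper, not part of this commit
    import uvicorn

    if __name__ == "__main__":
        # Serve the FastAPI app defined in main.py on the same port the Dockerfile exposes
        uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)
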
README.md ADDED
@@ -0,0 +1,8 @@
+ ---
+ title: Cricket Match Prediction FastAPI
+ emoji: 🦀
+ colorFrom: pink
+ colorTo: purple
+ sdk: docker
+ pinned: false
+ ---
__pycache__/main.cpython-310.pyc ADDED
Binary file (469 Bytes).

__pycache__/main.cpython-313.pyc ADDED
Binary file (597 Bytes).

__pycache__/routers.cpython-310.pyc ADDED
Binary file (2.32 kB).

__pycache__/routers.cpython-313.pyc ADDED
Binary file (3.43 kB).

__pycache__/services.cpython-310.pyc ADDED
Binary file (9.25 kB).

__pycache__/services.cpython-313.pyc ADDED
Binary file (17.3 kB).
 
data/cleaned_ball_data.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1263c36a2879b900f4e890d1b497bcdc4b934fd70d8ee4b4c2795a0f454f7da5
+ size 190499854
data/cleaned_match_data.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2bf73513be1bf6417b86a25d78308137534724109771eb5aaa15339167476535
+ size 384786
data/data_clean.py ADDED
@@ -0,0 +1,46 @@
+ import pandas as pd
+
+ # Load ball-by-ball dataset with low_memory=False to handle mixed types
+ ball_df = pd.read_csv('ODI_Match_Data.csv', low_memory=False)
+
+ # Clean ball-by-ball dataset
+ ball_df['match_id'] = ball_df['match_id'].astype(int)  # Ensure match_id is an integer
+ ball_df['season'] = ball_df['season'].astype(str).str.strip()  # Ensure season is a string, handle mixed types
+ ball_df['start_date'] = pd.to_datetime(ball_df['start_date'], errors='coerce')  # Convert to datetime, handle invalid dates
+ ball_df['runs_off_bat'] = ball_df['runs_off_bat'].fillna(0).astype(int)
+ ball_df['extras'] = ball_df['extras'].fillna(0).astype(int)
+ ball_df['wides'] = ball_df['wides'].fillna(0).astype(int)
+ ball_df['noballs'] = ball_df['noballs'].fillna(0).astype(int)
+ ball_df['byes'] = ball_df['byes'].fillna(0).astype(int)
+ ball_df['legbyes'] = ball_df['legbyes'].fillna(0).astype(int)
+ ball_df['penalty'] = ball_df['penalty'].fillna(0).astype(int)
+ ball_df['wicket_type'] = ball_df['wicket_type'].notna().astype(int)  # 1 if wicket, 0 if not
+ ball_df['batting_team'] = ball_df['batting_team'].astype(str).str.strip().str.title()  # Ensure string, then title case
+ ball_df['bowling_team'] = ball_df['bowling_team'].astype(str).str.strip().str.title()
+ ball_df['striker'] = ball_df['striker'].astype(str).str.strip().str.title()
+ ball_df['non_striker'] = ball_df['non_striker'].astype(str).str.strip().str.title()
+ ball_df['bowler'] = ball_df['bowler'].astype(str).str.strip().str.title()
+ ball_df['player_dismissed'] = ball_df['player_dismissed'].astype(str).str.strip().str.title()  # Ensure string, then title case
+
+ # Handle 'other_player_dismissed' - check if it's numeric or non-string, convert to string if possible
+ if ball_df['other_player_dismissed'].dtype != 'object':  # If not string type
+     ball_df['other_player_dismissed'] = ball_df['other_player_dismissed'].astype(str).str.strip().str.title()
+ else:
+     # If already object type, handle NaN or non-string values
+     ball_df['other_player_dismissed'] = ball_df['other_player_dismissed'].fillna('').astype(str).str.strip().str.title()
+
+ ball_df['other_wicket_type'] = ball_df['other_wicket_type'].astype(str).str.strip()  # Ensure string, handle as is
+
+ # Extract venue if needed (assuming start_date might contain venue or it's separate)
+ if 'venue' not in ball_df.columns:
+     if 'start_date' in ball_df.columns and ball_df['start_date'].dtype == 'object':
+         ball_df['venue'] = ball_df['start_date'].str.extract(r', (.+)$').fillna('N/A')
+     else:
+         ball_df['venue'] = 'N/A'  # Default if venue isn’t available
+
+ # Calculate total runs (including extras)
+ ball_df['total_runs'] = ball_df['runs_off_bat'] + ball_df['extras']
+
+ # Save cleaned ball-by-ball dataset
+ ball_df.to_csv('cleaned_ball_data.csv', index=False)
+ print("Cleaned ball-by-ball dataset saved as 'cleaned_ball_data.csv'")
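
Note: a quick sanity check on the script's output can catch cleaning regressions early. A minimal sketch, assuming cleaned_ball_data.csv has already been written by the script above; the column names are the ones that script produces.

    import pandas as pd

    # Spot-check the cleaned ball-by-ball file written by data_clean.py
    df = pd.read_csv('cleaned_ball_data.csv', low_memory=False)
    print(df[['runs_off_bat', 'extras', 'total_runs']].describe())
    # total_runs was defined as runs_off_bat + extras, so this should hold on every row
    assert (df['total_runs'] == df['runs_off_bat'] + df['extras']).all()
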
main.py ADDED
@@ -0,0 +1,10 @@
+ from fastapi import FastAPI
+ from routers import router
+
+ app = FastAPI(title="Cricket Statistics API")
+
+ app.include_router(router)
+
+ @app.get("/")
+ async def root():
+     return {"message": "Welcome to the Cricket Statistics API"}
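
Note: once the container (or a local Uvicorn process) is up, the root route can be smoke-tested from any HTTP client. A minimal sketch using the requests library (a client-side dependency, not listed in requirements.txt), assuming the API is reachable on localhost:7860.

    import requests

    # Hit the root endpoint defined in main.py
    resp = requests.get("http://localhost:7860/")
    print(resp.status_code)  # expected: 200
    print(resp.json())       # expected: {"message": "Welcome to the Cricket Statistics API"}
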
models/__pycache__/train_model.cpython-310.pyc ADDED
Binary file (9.49 kB).

models/__pycache__/train_model.cpython-313.pyc ADDED
Binary file (19.6 kB).
 
models/train_model.py ADDED
@@ -0,0 +1,295 @@
+ import pandas as pd
+ import numpy as np
+ from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingRegressor
+ from sklearn.multioutput import MultiOutputRegressor
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import mean_squared_error, accuracy_score, r2_score
+ from sklearn.preprocessing import StandardScaler
+ import joblib
+
+ # Load datasets with exact column names
+ ball_df = pd.read_csv('data/cleaned_ball_data.csv',
+                       dtype={
+                           'match_id': str, 'season': str, 'start_date': str, 'venue': str,
+                           'innings': int, 'ball': float, 'batting_team': str, 'bowling_team': str,
+                           'striker': str, 'non_striker': str, 'bowler': str, 'runs_off_bat': int,
+                           'extras': int, 'wides': float, 'noballs': float, 'byes': float,
+                           'legbyes': float, 'penalty': float, 'wicket_type': str,
+                           'player_dismissed': str, 'other_wicket_type': str,
+                           'other_player_dismissed': str, 'cricsheet_id': str, 'total_runs': int
+                       })
+ match_df = pd.read_csv('data/cleaned_match_data.csv',
+                        dtype={
+                            'id': str, 'season': str, 'city': str, 'date': str,
+                            'team1': str, 'team2': str, 'toss_winner': str, 'toss_decision': str,
+                            'result': str, 'dl_applied': int, 'winner': str,
+                            'win_by_runs': float, 'win_by_wickets': float, 'player_of_match': str,
+                            'venue': str, 'umpire1': str, 'umpire2': str, 'umpire3': str
+                        })
+
+ # Convert date columns to datetime
+ match_df['date'] = pd.to_datetime(match_df['date'], errors='coerce')
+ ball_df['start_date'] = pd.to_datetime(ball_df['start_date'], errors='coerce')
+
+ # Filter for ODI matches (proxy based on date range; adjust as needed)
+ odi_date_mask = (match_df['date'].dt.year >= 2015) & (match_df['date'].dt.year <= 2022)
+ match_df = match_df[odi_date_mask].copy()
+
+ # Compute team total scores from ball_df
+ team_scores = ball_df.groupby(['match_id', 'batting_team'])['total_runs'].sum().reset_index()
+ team_scores.rename(columns={'total_runs': 'team_total'}, inplace=True)
+
+ # Merge computed scores into match_df with better handling for missing data
+ match_df = match_df.merge(team_scores, left_on=['id', 'team1'], right_on=['match_id', 'batting_team'], how='left')
+ match_df.rename(columns={'team_total': 'team1_total'}, inplace=True)
+ match_df['team1_total'] = match_df['team1_total'].fillna(match_df['team1_total'].mean())
+ match_df = match_df.merge(team_scores, left_on=['id', 'team2'], right_on=['match_id', 'batting_team'], how='left')
+ match_df.rename(columns={'team_total': 'team2_total'}, inplace=True)
+ match_df['team2_total'] = match_df['team2_total'].fillna(match_df['team2_total'].mean())
+
+ # Drop extra columns created during merging
+ match_df.drop(columns=['batting_team', 'match_id'], errors='ignore', inplace=True)
+
+ # Add venue and city indices
+ match_df['venue_index'] = match_df['venue'].astype('category').cat.codes
+ match_df['city_index'] = match_df['city'].astype('category').cat.codes
+
+ # Add toss features
+ match_df['toss_winner_index'] = match_df['toss_winner'].astype('category').cat.codes
+ match_df['toss_decision_index'] = match_df['toss_decision'].map({'bat': 1, 'field': 0}).fillna(0).astype(int)
+
+ # Compute historical win rates for each team (weighted by recency)
+ match_df['date_numeric'] = (match_df['date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1d')
+ max_date = match_df['date_numeric'].max()
+ team1_wins = match_df[match_df['winner'] == match_df['team1']].groupby('team1').agg({'date_numeric': 'mean', 'id': 'count'}).reset_index()
+ team1_wins.rename(columns={'id': 'wins', 'date_numeric': 'win_date', 'team1': 'team'}, inplace=True)
+ team2_wins = match_df[match_df['winner'] == match_df['team2']].groupby('team2').agg({'date_numeric': 'mean', 'id': 'count'}).reset_index()
+ team2_wins.rename(columns={'id': 'wins', 'date_numeric': 'win_date', 'team2': 'team'}, inplace=True)
+ team_wins = pd.concat([team1_wins, team2_wins]).groupby('team').agg({'wins': 'sum', 'win_date': 'mean'}).reset_index()
+ team1_matches = match_df.groupby('team1').size().reset_index(name='matches')
+ team1_matches.rename(columns={'team1': 'team'}, inplace=True)
+ team2_matches = match_df.groupby('team2').size().reset_index(name='matches')
+ team2_matches.rename(columns={'team2': 'team'}, inplace=True)
+ team_matches = pd.concat([team1_matches, team2_matches]).groupby('team')['matches'].sum().reset_index()
+ team_win_rates = team_matches.merge(team_wins, on='team', how='left').fillna(0)
+ team_win_rates['weighted_wins'] = team_win_rates.apply(lambda x: x['wins'] * np.exp(-0.1 * (max_date - x['win_date']) / 365) if pd.notna(x['win_date']) else 0, axis=1)
+ team_win_rates['win_rate'] = team_win_rates['weighted_wins'] / team_win_rates['matches']
+ team_win_rates['win_rate'] = team_win_rates['win_rate'].fillna(0)
+ match_df = match_df.merge(team_win_rates[['team', 'win_rate']].rename(columns={'team': 'team1', 'win_rate': 'team1_win_rate'}), on='team1', how='left')
+ match_df = match_df.merge(team_win_rates[['team', 'win_rate']].rename(columns={'team': 'team2', 'win_rate': 'team2_win_rate'}), on='team2', how='left')
+
+ # Compute head-to-head win rates with minimum match threshold
+ head_to_head = match_df[match_df['team1'].isin(match_df['team1'].unique()) & match_df['team2'].isin(match_df['team2'].unique())]
+ head_to_head_wins = head_to_head[head_to_head['winner'] == head_to_head['team1']].groupby(['team1', 'team2']).size().reset_index(name='h2h_wins')
+ head_to_head_matches = head_to_head.groupby(['team1', 'team2']).size().reset_index(name='h2h_matches')
+ h2h_win_rates = head_to_head_matches.merge(head_to_head_wins, on=['team1', 'team2'], how='left').fillna(0)
+ h2h_win_rates = h2h_win_rates[head_to_head_matches['h2h_matches'] >= 1]
+ h2h_win_rates['h2h_win_rate'] = h2h_win_rates['h2h_wins'] / h2h_win_rates['h2h_matches']
+ match_df = match_df.merge(h2h_win_rates[['team1', 'team2', 'h2h_win_rate']], on=['team1', 'team2'], how='left').fillna(0)
+
+ # Cap outliers in target variables
+ match_df['team1_total'] = match_df['team1_total'].clip(upper=500)
+ match_df['team2_total'] = match_df['team2_total'].clip(upper=500)
+
+ # Train Team Performance Prediction Model
+ def train_team_performance_model():
+     data = match_df[['team1', 'team2', 'winner', 'team1_total', 'team2_total', 'venue_index', 'city_index',
+                      'toss_winner_index', 'toss_decision_index', 'dl_applied', 'team1_win_rate',
+                      'team2_win_rate', 'h2h_win_rate']].dropna()
+
+     # Convert categorical teams to numerical indices
+     data['team1_index'] = data['team1'].astype('category').cat.codes
+     data['team2_index'] = data['team2'].astype('category').cat.codes
+     data['winner_index'] = (data['winner'] == data['team1']).astype(int)
+
+     # Features and targets
+     X = pd.DataFrame()
+     X['team1_index'] = data['team1_index']
+     X['team2_index'] = data['team2_index']
+     X['venue_index'] = data['venue_index']
+     X['city_index'] = data['city_index']
+     X['toss_winner_index'] = data['toss_winner_index']
+     X['toss_decision_index'] = data['toss_decision_index']
+     X['dl_applied'] = data['dl_applied']
+     X['team1_win_rate'] = data['team1_win_rate']
+     X['team2_win_rate'] = data['team2_win_rate']
+     X['h2h_win_rate'] = data['h2h_win_rate'] * 2  # Double weight to head-to-head
+
+     y_win = data['winner_index']
+     y_score = data[['team1_total', 'team2_total']]
+
+     # Scale numerical features
+     scaler = StandardScaler()
+     scaled_features = scaler.fit_transform(X[['venue_index', 'city_index', 'toss_winner_index', 'toss_decision_index',
+                                               'dl_applied', 'team1_win_rate', 'team2_win_rate', 'h2h_win_rate']])
+     X_scaled = pd.DataFrame(scaled_features, columns=['venue_index', 'city_index', 'toss_winner_index', 'toss_decision_index',
+                                                       'dl_applied', 'team1_win_rate', 'team2_win_rate', 'h2h_win_rate'])
+     X_scaled['team1_index'] = X['team1_index']
+     X_scaled['team2_index'] = X['team2_index']
+
+     # Train/test split for win prediction
+     X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_win, test_size=0.2, random_state=42)
+
+     # Train RandomForestClassifier with tuned hyperparameters
+     win_model = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, class_weight='balanced')
+     win_model.fit(X_train, y_train)
+
+     # Evaluate
+     y_pred = win_model.predict(X_test)
+     accuracy = accuracy_score(y_test, y_pred)
+     print(f"Team Win Model Accuracy: {accuracy}")
+
+     # Train HistGradientBoostingRegressor with MultiOutputRegressor for score prediction
+     base_score_model = HistGradientBoostingRegressor(random_state=42, learning_rate=0.1, max_iter=100)
+     score_model = MultiOutputRegressor(base_score_model)
+     score_model.fit(X_scaled, y_score)
+
+     # Evaluate score model
+     y_score_pred = score_model.predict(X_scaled)
+     mse = mean_squared_error(y_score, y_score_pred)
+     r2 = r2_score(y_score, y_score_pred)
+     print(f"Team Score Model MSE: {mse}, R²: {r2}")
+
+     # Save models and scaler
+     joblib.dump((win_model, score_model, data, scaler), 'models/team_performance_predictor.pkl')
+
+ # Train Player Score Prediction Model
+ def train_player_score_model():
+     # Aggregate player runs per match from ball_df, including batting_team
+     player_runs = ball_df.groupby(['match_id', 'striker', 'batting_team'])['runs_off_bat'].sum().reset_index()
+     player_runs.rename(columns={'runs_off_bat': 'player_total'}, inplace=True)
+
+     # Merge with match_df to get match context
+     player_data = player_runs.merge(match_df, left_on='match_id', right_on='id', how='left')
+
+     # Feature engineering for player performance
+     player_data['player_avg'] = player_data.groupby('striker')['player_total'].transform('mean')  # Average runs per player
+     player_data['team_win_rate'] = player_data.apply(lambda x: player_data[player_data['team1'] == x['batting_team']]['team1_win_rate'].mean()
+                                                      if x['batting_team'] == x['team1'] else player_data[player_data['team2'] == x['batting_team']]['team2_win_rate'].mean(), axis=1)
+     player_data['venue_index'] = player_data['venue'].astype('category').cat.codes
+     player_data['city_index'] = player_data['city'].astype('category').cat.codes
+     player_data['toss_winner_index'] = player_data['toss_winner'].astype('category').cat.codes
+     player_data['toss_decision_index'] = player_data['toss_decision'].map({'bat': 1, 'field': 0}).fillna(0).astype(int)
+
+     # Features and target
+     X = player_data[['player_avg', 'team_win_rate', 'venue_index', 'city_index', 'toss_winner_index', 'toss_decision_index']].dropna()
+     y = player_data.loc[X.index, 'player_total']
+
+     # Scale numerical features
+     scaler = StandardScaler()
+     X_scaled = scaler.fit_transform(X)
+
+     # Train/test split
+     X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
+
+     # Train HistGradientBoostingRegressor
+     score_model = HistGradientBoostingRegressor(random_state=42, learning_rate=0.1, max_iter=100)
+     score_model.fit(X_train, y_train)
+
+     # Evaluate
+     y_pred = score_model.predict(X_test)
+     mse = mean_squared_error(y_test, y_pred)
+     r2 = r2_score(y_test, y_pred)
+     print(f"Player Score Model MSE: {mse}, R²: {r2}")
+
+     # Save model and scaler
+     joblib.dump((score_model, scaler, player_data), 'models/player_score_predictor.pkl')
+
+ # Predict Player Score
+ def predict_player_score(player: str, team: str, opponent: str, venue: str = None, city: str = None,
+                          toss_winner: str = None, toss_decision: str = None):
+     try:
+         score_model, scaler, player_data = joblib.load('models/player_score_predictor.pkl')
+
+         if player not in player_data['striker'].values or team not in player_data['batting_team'].values:
+             raise ValueError("Player or team not found in training data")
+
+         # Compute player average from historical data
+         player_avg = player_data[player_data['striker'] == player]['player_total'].mean()
+         team_win_rate = player_data[player_data['batting_team'] == team]['team_win_rate'].mean()
+
+         # Use specific values if provided, otherwise default to mean
+         venue_index = player_data[player_data['venue'] == venue]['venue_index'].values[0] if venue else player_data['venue_index'].mean()
+         city_index = player_data[player_data['city'] == city]['city_index'].values[0] if city else player_data['city_index'].mean()
+         toss_winner_index = player_data[player_data['toss_winner'] == toss_winner]['toss_winner_index'].values[0] if toss_winner else player_data['toss_winner_index'].mean()
+         toss_decision_index = 1 if toss_decision == 'bat' else 0 if toss_decision == 'field' else player_data['toss_decision_index'].mean()
+
+         # Scale features
+         features = scaler.transform([[player_avg, team_win_rate, venue_index, city_index, toss_winner_index, toss_decision_index]])
+         predicted_score = score_model.predict(features)[0]
+
+         return {
+             "player": player,
+             "team": team,
+             "opponent": opponent,
+             "expected_score": round(predicted_score, 2)
+         }
+     except Exception as e:
+         print(f"Prediction error: {str(e)}")
+         return {
+             "player": player,
+             "team": team,
+             "opponent": opponent,
+             "expected_score": 0.0
+         }
+
+ # Predict Team Win Percentage & Expected Score with debugging
+ def predict_team_performance(team1: str, team2: str, venue: str = None, city: str = None,
+                              toss_winner: str = None, toss_decision: str = None):
+     try:
+         win_model, score_model, data, scaler = joblib.load('models/team_performance_predictor.pkl')
+
+         if team1 not in data['team1'].values or team2 not in data['team2'].values:
+             raise ValueError("Team not found in training data")
+
+         # Get team indices
+         team1_index = data[data['team1'] == team1]['team1_index'].values[0]
+         team2_index = data[data['team2'] == team2]['team2_index'].values[0]
+
+         # Use specific values if provided, otherwise default to mean
+         venue_index = data[data['venue'] == venue]['venue_index'].values[0] if venue else data['venue_index'].mean()
+         city_index = data[data['city'] == city]['city_index'].values[0] if city else data['city_index'].mean()
+         toss_winner_index = data[data['toss_winner'] == toss_winner]['toss_winner_index'].values[0] if toss_winner else data['toss_winner_index'].mean()
+         toss_decision_index = 1 if toss_decision == 'bat' else 0 if toss_decision == 'field' else data['toss_decision_index'].mean()
+         dl_applied = 0 if pd.isna(toss_decision) else data['dl_applied'].mean()
+         team1_win_rate = data[data['team1'] == team1]['team1_win_rate'].values[0]
+         team2_win_rate = data[data['team2'] == team2]['team2_win_rate'].values[0]
+         h2h_win_rate = data[(data['team1'] == team1) & (data['team2'] == team2)]['h2h_win_rate'].values[0] if not data[(data['team1'] == team1) & (data['team2'] == team2)].empty else 0
+
+         # Debug head-to-head and win rates
+         print(f"Team1: {team1}, Team2: {team2}, h2h_win_rate: {h2h_win_rate}, team1_win_rate: {team1_win_rate}, team2_win_rate: {team2_win_rate}")
+
+         # Scale features; double h2h_win_rate to mirror the weighting used when the scaler was fitted
+         features = scaler.transform([[venue_index, city_index, toss_winner_index, toss_decision_index, dl_applied,
+                                       team1_win_rate, team2_win_rate, h2h_win_rate * 2]])
+         # Assemble the row in the same column order used during training: scaled features first, then team indices
+         model_input = [[features[0][0], features[0][1], features[0][2], features[0][3],
+                         features[0][4], features[0][5], features[0][6], features[0][7],
+                         team1_index, team2_index]]
+         win_probability = win_model.predict_proba(model_input)[:, 1][0] * 100
+         predicted_scores = score_model.predict(model_input)[0]
+
+         if np.isnan(predicted_scores[0]) or np.isnan(predicted_scores[1]):
+             print(f"Warning: Predicted scores are NaN for {team1} vs {team2}")
+
+         return {
+             "team1": team1,
+             "team2": team2,
+             "win_probability_team1": round(win_probability, 2),
+             "expected_team1_score": round(predicted_scores[0], 2),
+             "expected_team2_score": round(predicted_scores[1], 2)
+         }
+     except Exception as e:
+         print(f"Prediction error: {str(e)}")
+         return {
+             "team1": team1,
+             "team2": team2,
+             "win_probability_team1": 50.0,
+             "expected_team1_score": 0.0,
+             "expected_team2_score": 0.0
+         }
+
+ # Train the models
+ if __name__ == "__main__":
+     train_team_performance_model()
+     train_player_score_model()
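
Note: after running this module directly (python models/train_model.py), the two .pkl artifacts exist and the prediction helpers can be called in isolation. A minimal sketch; it assumes the data/ CSVs are available locally (they are tracked via Git LFS), and the team and player names are illustrative only — they must appear in the cleaned datasets to get a non-fallback result.

    from models.train_model import predict_team_performance, predict_player_score

    # Illustrative names; anything absent from the training data triggers the fallback response
    print(predict_team_performance("India", "Australia"))
    print(predict_player_score(player="V Kohli", team="India", opponent="Australia"))
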
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ fastapi
+ uvicorn
+ pandas
+ numpy
+ scikit-learn
+ joblib
+ groq
routers.py ADDED
@@ -0,0 +1,64 @@
+ from fastapi import APIRouter, HTTPException
+ from services import (
+     get_player_stats,
+     get_team_stats,
+     get_match_history,
+     get_teams,
+     get_players,
+     get_seasons,
+     predict_score,
+     predict_team_outcome,
+     get_team_trends,  # New function for team trends
+     get_player_trends,  # New function for player trends
+ )
+
+ router = APIRouter()
+
+ @router.get("/player-stats/")
+ async def player_statistics(player_name: str, season: str = None, role: str = "Batting"):
+     # Validate role parameter
+     if role not in ["Batting", "Bowling"]:
+         raise HTTPException(status_code=400, detail="Role must be 'Batting' or 'Bowling'")
+     return get_player_stats(player_name, season, role)
+
+ @router.get("/team-stats/")
+ async def team_statistics(team_name: str, season: str = None):
+     return get_team_stats(team_name, season)
+
+ @router.get("/match-history/")
+ async def match_history(team1: str, team2: str, season: str = None):
+     # get_match_history handles two teams
+     return get_match_history(team1, team2, season)
+
+ @router.get("/teams")
+ async def teams():
+     return get_teams()
+
+ @router.get("/players")
+ async def players():
+     return get_players()
+
+ @router.get("/seasons")
+ async def seasons():
+     return get_seasons()
+
+ @router.get("/predict-score/")
+ async def predict_player_score_endpoint(player_name: str, opposition_team: str):
+     return predict_score(player_name, opposition_team)
+
+ @router.get("/predict-team-outcome/")
+ async def predict_team_outcome_endpoint(team1: str, team2: str):
+     return predict_team_outcome(team1, team2)
+
+ # New endpoint for team trends over time
+ @router.get("/team-trends/")
+ async def team_trends(team_name: str):
+     return get_team_trends(team_name)
+
+ # New endpoint for player trends over time
+ @router.get("/player-trends/")
+ async def player_trends(player_name: str, role: str = "Batting"):
+     # Validate role parameter
+     if role not in ["Batting", "Bowling"]:
+         raise HTTPException(status_code=400, detail="Role must be 'Batting' or 'Bowling'")
+     return get_player_trends(player_name, role)
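
Note: the routes above are all GET endpoints with query parameters, so they can be exercised end-to-end with FastAPI's TestClient without starting a server. A minimal sketch, assuming it runs from the project root (so main.py, the data/ files, and the trained .pkl models are reachable), that httpx is installed for the test client, and that GROQ_API_KEY is set in the environment; the player and team values are illustrative.

    from fastapi.testclient import TestClient
    from main import app

    client = TestClient(app)

    # Illustrative query values; 404s are expected if they are absent from the data
    print(client.get("/teams").json())
    print(client.get("/player-stats/", params={"player_name": "V Kohli", "role": "Batting"}).json())
    print(client.get("/predict-team-outcome/", params={"team1": "India", "team2": "Australia"}).json())
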
services.py ADDED
@@ -0,0 +1,293 @@
+ import os
+ import pandas as pd
+ import numpy as np
+ from fastapi import HTTPException
+ from models.train_model import predict_player_score, predict_team_performance
+ from groq import Groq
+
+ # Initialize Groq client; read the API key from the environment instead of hard-coding a secret
+ GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
+ client = Groq(api_key=GROQ_API_KEY)
+
+ # Load datasets
+ match_df = pd.read_csv('data/cleaned_match_data.csv')
+ match_df['date'] = pd.to_datetime(match_df['date'], errors='coerce')
+ ball_df = pd.read_csv('data/cleaned_ball_data.csv', low_memory=False)
+
+ # Create a player-team mapping based on ball_df
+ player_team_mapping = ball_df.groupby('striker')['batting_team'].agg(lambda x: x.mode()[0] if len(x.mode()) > 0 else None).to_dict()
+
+ # Function to clean JSON data
+ def clean_json(data):
+     if isinstance(data, dict):
+         return {k: clean_json(v) for k, v in data.items()}
+     elif isinstance(data, list):
+         return [clean_json(v) for v in data]
+     elif isinstance(data, float):
+         return 0.0 if pd.isna(data) or np.isinf(data) else data
+     elif pd.isna(data):
+         return None
+     elif isinstance(data, pd.Timestamp):
+         return data.strftime('%Y-%m-%d') if pd.notna(data) else None
+     elif isinstance(data, (int, bool)):
+         return data
+     return str(data)
+
+ # LLM summary generation function using Groq
+ def generate_summary(data, context_type):
+     prompt = ""
+     if context_type == "player_stats":
+         prompt = f"Summarize this player data in one sentence: {data}"
+     elif context_type == "team_stats":
+         prompt = f"Summarize this team data in one sentence: {data}"
+     elif context_type == "match_history":
+         prompt = f"Summarize this match history between {data['team1']} and {data['team2']} in one sentence: {data['matches']}"
+     elif context_type == "prediction_score":
+         prompt = f"Summarize this prediction in one sentence: {data}"
+     elif context_type == "prediction_team":
+         prompt = f"Summarize this team prediction in one sentence: {data}"
+
+     try:
+         chat_completion = client.chat.completions.create(
+             model="mixtral-8x7b-32768",
+             messages=[
+                 {"role": "system", "content": "You are a concise cricket analyst."},
+                 {"role": "user", "content": prompt}
+             ],
+             max_tokens=50,
+             temperature=0.7
+         )
+         summary = chat_completion.choices[0].message.content.strip()
+         return summary
+     except Exception as e:
+         return f"Summary unavailable due to error: {str(e)}"
+
+ # Player statistics with name variation handling
+ def get_player_stats(player_name: str, season: str = None, role: str = "Batting"):
+     player_name = player_name.strip().title()
+     name_variations = [player_name, player_name.replace(" ", ""), " ".join(reversed(player_name.split()))]
+     player_data = ball_df[ball_df['striker'].isin(name_variations) | ball_df['bowler'].isin(name_variations)]
+     if season and 'season' in ball_df.columns:
+         player_data = player_data[player_data['season'] == season]
+     if player_data.empty:
+         raise HTTPException(status_code=404, detail=f"Player '{player_name}' not found. Variations tried: {name_variations}")
+
+     if role == "Batting":
+         batting_data = player_data[player_data['striker'].isin(name_variations)]
+         total_runs = int(batting_data['runs_off_bat'].sum())
+         balls_faced = int(batting_data.shape[0])
+         strike_rate = float((total_runs / balls_faced * 100) if balls_faced > 0 else 0)
+         matches_played = int(len(batting_data['match_id'].unique()))
+
+         stats = {
+             "player_name": player_name,
+             "role": role,
+             "total_runs": total_runs,
+             "balls_faced": balls_faced,
+             "strike_rate": strike_rate,
+             "matches_played": matches_played,
+             "season": season if season else "All Seasons"
+         }
+         stats["summary"] = generate_summary(stats, "player_stats")
+         return clean_json(stats)
+
+     elif role == "Bowling":
+         bowling_data = player_data[player_data['bowler'].isin(name_variations)]
+         bowler_wicket_types = ["caught", "bowled", "lbw", "caught and bowled", "hit wicket"]
+         wickets_data = bowling_data[bowling_data['player_dismissed'].notna() &
+                                     bowling_data['wicket_type'].isin(bowler_wicket_types)]
+         total_wickets = int(wickets_data.shape[0])
+         total_runs_conceded = int(bowling_data['total_runs'].sum())
+         total_balls_bowled = int(bowling_data.shape[0])
+         total_overs_bowled = float(total_balls_bowled / 6)
+         bowling_average = float(total_runs_conceded / total_wickets) if total_wickets > 0 else float('inf')
+         economy_rate = float(total_runs_conceded / total_overs_bowled) if total_overs_bowled > 0 else 0
+         bowling_strike_rate = float(total_balls_bowled / total_wickets) if total_wickets > 0 else float('inf')
+         bowling_matches = int(len(bowling_data['match_id'].unique()))
+
+         stats = {
+             "player_name": player_name,
+             "role": role,
+             "total_wickets": total_wickets,
+             "bowling_average": 0.0 if np.isinf(bowling_average) else round(bowling_average, 2),
+             "economy_rate": round(economy_rate, 2),
+             "bowling_strike_rate": 0.0 if np.isinf(bowling_strike_rate) else round(bowling_strike_rate, 2),
+             "overs_bowled": round(total_overs_bowled, 1),
+             "bowling_matches": bowling_matches,
+             "season": season if season else "All Seasons"
+         }
+         stats["summary"] = generate_summary(stats, "player_stats")
+         return clean_json(stats)
+
+ # Team statistics
+ def get_team_stats(team_name: str, season: str = None):
+     team_name = team_name.strip().title()
+     team_matches = match_df[(match_df['team1'] == team_name) | (match_df['team2'] == team_name)]
+     if season and 'season' in match_df.columns:
+         team_matches = team_matches[team_matches['season'] == season]
+     if team_matches.empty:
+         raise HTTPException(status_code=404, detail="Team not found")
+
+     wins = int(team_matches[team_matches['winner'] == team_name].shape[0])
+     total_matches = int(team_matches.shape[0])
+
+     stats = {
+         "total_matches": total_matches,
+         "wins": wins,
+         "losses": total_matches - wins,
+         "win_percentage": float((wins / total_matches * 100) if total_matches > 0 else 0),
+         "season": season if season else "All Seasons"
+     }
+     stats["summary"] = generate_summary(stats, "team_stats")
+     return clean_json(stats)
+
+ # Match History Retrieval
+ def get_match_history(team1: str, team2: str, season: str = None):
+     team1 = team1.strip().title()
+     team2 = team2.strip().title()
+     available_teams = set(match_df['team1'].unique().tolist() + match_df['team2'].unique().tolist())
+     if team1 not in available_teams or team2 not in available_teams:
+         raise HTTPException(status_code=404, detail=f"Team {team1 if team1 not in available_teams else team2} not found.")
+
+     team_matches = match_df[
+         ((match_df['team1'] == team1) & (match_df['team2'] == team2)) |
+         ((match_df['team1'] == team2) & (match_df['team2'] == team1))
+     ].copy()
+     if season and 'season' in match_df.columns:
+         team_matches = team_matches[team_matches['season'] == season]
+     if team_matches.empty:
+         raise HTTPException(status_code=404, detail=f"No match history found between {team1} and {team2}.")
+
+     team_matches['date'] = team_matches['date'].apply(lambda x: x.strftime('%Y-%m-%d') if pd.notna(x) else None)
+     team_matches['winner'] = team_matches['winner'].fillna("Draw")
+     for column in ['team1', 'team2', 'winner']:
+         team_matches[column] = team_matches[column].apply(lambda x: str(x) if pd.notna(x) else None)
+     history = team_matches[['date', 'team1', 'team2', 'winner']].to_dict(orient='records')
+
+     response = {
+         "team1": team1,
+         "team2": team2,
+         "season": season if season else "All Seasons",
+         "matches": history
+     }
+     response["summary"] = generate_summary(response, "match_history")
+     return clean_json(response)
+
+ # Prediction functions
+ def predict_score(player_name: str, opposition_team: str):
+     try:
+         # Handle name variations
+         player_name = player_name.strip().replace("+", " ").title()
+         name_variations = [player_name, player_name.replace(" ", ""), " ".join(reversed(player_name.split()))]
+         player_team = None
+         for name in name_variations:
+             if name in player_team_mapping:
+                 player_team = player_team_mapping[name]
+                 player_name = name  # Use the matched name
+                 break
+         if not player_team:
+             raise ValueError(f"Player {player_name} not found in historical data")
+
+         # Debug: Print arguments before calling predict_player_score
+         print(f"Calling predict_player_score with: player={player_name}, team={player_team}, opponent={opposition_team}")
+
+         predicted_runs = predict_player_score(
+             player=player_name,
+             team=player_team,
+             opponent=opposition_team,
+             venue=None,
+             city=None,
+             toss_winner=None,
+             toss_decision=None
+         )
+         stats = {
+             "player": player_name,
+             "team": player_team,
+             "opposition": opposition_team,
+             "predicted_runs": predicted_runs["expected_score"]
+         }
+         stats["summary"] = generate_summary(stats, "prediction_score")
+         return clean_json(stats)
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Error predicting score for {player_name} against {opposition_team}: {str(e)}")
+
+ def predict_team_outcome(team1: str, team2: str):
+     prediction = predict_team_performance(team1, team2)
+     prediction["summary"] = generate_summary(prediction, "prediction_team")
+     return clean_json(prediction)
+
+ # Utility functions
+ def get_teams():
+     return clean_json({"teams": sorted(set(match_df['team1'].unique().tolist() + match_df['team2'].unique().tolist()))})
+
+ def get_players():
+     unique_players = sorted(set(ball_df['striker'].dropna().unique().tolist()))
+     return clean_json({"players": unique_players})
+
+ def get_seasons():
+     return clean_json({"seasons": ["All Seasons"] + sorted(match_df['season'].dropna().unique().tolist())})
+
+ # New function for team trends over time
+ def get_team_trends(team_name: str):
+     team_name = team_name.strip().title()
+     team_matches = match_df[(match_df['team1'] == team_name) | (match_df['team2'] == team_name)]
+     if team_matches.empty:
+         raise HTTPException(status_code=404, detail="Team not found")
+
+     # Calculate win percentage and other metrics for each season
+     trends = []
+     for season in match_df['season'].unique():
+         season_matches = team_matches[team_matches['season'] == season]
+         if not season_matches.empty:
+             wins = season_matches[season_matches['winner'] == team_name].shape[0]
+             total_matches = season_matches.shape[0]
+             win_percentage = (wins / total_matches * 100) if total_matches > 0 else 0
+             trends.append({
+                 "season": season,
+                 "wins": wins,
+                 "total_matches": total_matches,
+                 "win_percentage": win_percentage
+             })
+
+     return {"team_name": team_name, "trends": trends}
+
+ # New function for player trends over time
+ def get_player_trends(player_name: str, role: str = "Batting"):
+     player_name = player_name.strip().title()
+     name_variations = [player_name, player_name.replace(" ", ""), " ".join(reversed(player_name.split()))]
+     player_data = ball_df[ball_df['striker'].isin(name_variations) | ball_df['bowler'].isin(name_variations)]
+     if player_data.empty:
+         raise HTTPException(status_code=404, detail=f"Player '{player_name}' not found")
+
+     # Calculate performance metrics for each season
+     trends = []
+     for season in ball_df['season'].unique():
+         season_data = player_data[player_data['season'] == season]
+         if not season_data.empty:
+             if role == "Batting":
+                 total_runs = int(season_data['runs_off_bat'].sum())
+                 balls_faced = int(season_data.shape[0])
+                 strike_rate = float((total_runs / balls_faced * 100) if balls_faced > 0 else 0)
+                 matches_played = int(len(season_data['match_id'].unique()))
+                 trends.append({
+                     "season": season,
+                     "total_runs": total_runs,
+                     "strike_rate": strike_rate,
+                     "matches_played": matches_played
+                 })
+             elif role == "Bowling":
+                 total_wickets = int(season_data[season_data['wicket_type'].notna()].shape[0])
+                 total_runs_conceded = int(season_data['total_runs'].sum())
+                 total_overs_bowled = float(season_data.shape[0] / 6)
+                 bowling_average = float(total_runs_conceded / total_wickets) if total_wickets > 0 else float('inf')
+                 economy_rate = float(total_runs_conceded / total_overs_bowled) if total_overs_bowled > 0 else 0
+                 matches_played = int(len(season_data['match_id'].unique()))
+                 trends.append({
+                     "season": season,
+                     "total_wickets": total_wickets,
+                     "bowling_average": bowling_average,
+                     "economy_rate": economy_rate,
+                     "matches_played": matches_played
+                 })
+
+     return {"player_name": player_name, "role": role, "trends": trends}
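
Note: clean_json is what makes every response JSON-safe before FastAPI serializes it, so its behaviour is worth pinning down with a tiny example. A minimal sketch based solely on the branches in the function above; it assumes the data/ CSVs are present and GROQ_API_KEY is set, since importing services loads them at module level.

    import numpy as np
    import pandas as pd
    from services import clean_json

    sample = {
        "strike_rate": float("nan"),         # NaN floats become 0.0
        "bowling_average": float("inf"),     # infinities become 0.0
        "date": pd.Timestamp("2019-07-14"),  # Timestamps become 'YYYY-MM-DD' strings
        "wins": 3,                           # ints pass through unchanged
        "matches": [np.nan, 2],              # lists are cleaned element-wise
    }
    print(clean_json(sample))
    # {'strike_rate': 0.0, 'bowling_average': 0.0, 'date': '2019-07-14', 'wins': 3, 'matches': [0.0, 2]}
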