ssyok committed on
Commit
713c6ae
·
1 Parent(s): 9803403

Change the logic to predict both likes and shares. Previously, likes was used as an input feature, which was incorrect.

Browse files
Data_Analytics_SHE_Course_Group_Assignment_Machine_Learning.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
app.py CHANGED
@@ -1,72 +1,35 @@
 
 
1
  import gradio as gr
2
  import pandas as pd
3
  import numpy as np
4
  import joblib
5
  import os
6
 
7
- # ==============================================================================
8
- # 1. LOAD MODELS AND SCALER (This part runs once when the script starts)
9
- # ==============================================================================
10
-
11
- # Dictionary to hold the loaded model objects and a list of their names
12
- all_models = {}
13
- model_names = [
14
- 'Linear Regression', 'Ridge Regression', 'Lasso Regression',
15
- 'Random Forest', 'Gradient Boosting'
16
- ]
17
- BEST_MODEL_NAME = 'Random Forest' # Define the best model to be highlighted
18
-
19
- try:
20
- # Load all the regression models
21
- for name in model_names:
22
- # Construct the filename, e.g., 'models/random_forest.joblib'
23
- filename = f"models/{name.lower().replace(' ', '_')}.joblib"
24
- if os.path.exists(filename):
25
- all_models[name] = joblib.load(filename)
26
- else:
27
- raise FileNotFoundError(f"Model file not found: {filename}")
28
-
29
- # Load the scaler
30
- scaler_path = 'models/scaler.joblib'
31
- if os.path.exists(scaler_path):
32
- scaler = joblib.load(scaler_path)
33
- else:
34
- raise FileNotFoundError(f"Scaler file not found: {scaler_path}")
35
-
36
- models_loaded = True
37
- print("✅ All models and scaler loaded successfully!")
38
-
39
- # Get the feature names the model was trained on from the scaler
40
- expected_columns = scaler.feature_names_in_
41
- print(f"Models expect {len(expected_columns)} features.")
42
-
43
- except Exception as e:
44
- print(f"❌ ERROR: Could not load models. {e}")
45
- print("Please ensure all '.joblib' files are in the 'models/' directory.")
46
- models_loaded = False
47
- all_models = {}
48
- scaler = None
49
- expected_columns = []
50
-
51
- # ==============================================================================
52
- # 2. PREDICTION FUNCTION
53
- # ==============================================================================
54
-
55
- def predict_shares_all_models(likes, generation_time, gpu_usage, file_size_kb,
56
- width, height, style_accuracy_score,
57
- is_hand_edited, ethical_concerns_flag,
58
- day_of_week, month, hour, platform):
59
  """
60
- Performs feature engineering, predicts shares using all loaded models,
61
- and returns formatted outputs for the Gradio interface.
62
  """
63
- if not models_loaded:
64
- error_message = "Models are not loaded. Please check the console for errors."
65
- return 0, error_message, error_message
66
-
67
- # --- Step A: Perform feature engineering ---
68
  sample_data = {
69
- 'likes': likes,
70
  'style_accuracy_score': style_accuracy_score,
71
  'generation_time': generation_time,
72
  'gpu_usage': gpu_usage,
@@ -79,125 +42,236 @@ def predict_shares_all_models(likes, generation_time, gpu_usage, file_size_kb,
79
  'month': month,
80
  'hour': hour
81
  }
82
-
 
83
  sample_data['aspect_ratio'] = width / height if height > 0 else 0
84
  sample_data['total_pixels'] = width * height
85
  sample_data['is_square'] = int(width == height)
86
  sample_data['is_weekend'] = int(day_of_week >= 5)
87
-
 
88
  for p in ['Twitter', 'TikTok', 'Reddit', 'Instagram']:
89
  sample_data[f'platform_{p}'] = 1 if platform == p else 0
90
-
91
- sample_data['engagement_rate'] = likes / (sample_data['total_pixels'] / 1000000 + 1)
92
- sample_data['quality_engagement'] = style_accuracy_score * likes / 100
93
  sample_data['file_density'] = file_size_kb / (sample_data['total_pixels'] / 1000 + 1)
94
  sample_data['gpu_efficiency'] = generation_time / (gpu_usage + 1)
95
-
96
- for p in ['Twitter', 'TikTok', 'Reddit', 'Instagram']:
97
- sample_data[f'{p.lower()}_likes'] = likes * sample_data[f'platform_{p}']
98
-
99
  sample_data['month_sin'] = np.sin(2 * np.pi * month / 12)
100
  sample_data['month_cos'] = np.cos(2 * np.pi * month / 12)
101
  sample_data['day_sin'] = np.sin(2 * np.pi * day_of_week / 7)
102
  sample_data['day_cos'] = np.cos(2 * np.pi * day_of_week / 7)
103
-
104
- # --- Step B: Align columns and Scale ---
 
 
105
  sample_df = pd.DataFrame([sample_data])
106
  sample_df = sample_df.reindex(columns=expected_columns, fill_value=0)
107
- sample_scaled = scaler.transform(sample_df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
- # --- Step C: Predict with all models ---
110
- predictions = {}
111
- for name, model in all_models.items():
112
- pred_value = model.predict(sample_scaled)[0]
113
- predictions[name] = max(0, int(pred_value))
114
 
115
- # --- Step D: Format the outputs for Gradio ---
 
 
116
 
117
- # 1. Get the single best model prediction
118
- best_model_prediction = predictions.get(BEST_MODEL_NAME, 0)
 
 
119
 
120
- # 2. Create a Markdown table for all model predictions
121
- all_results_df = pd.DataFrame(list(predictions.items()), columns=['Model', 'Predicted Shares'])
122
- all_results_df = all_results_df.sort_values('Predicted Shares', ascending=False)
123
- all_models_table = all_results_df.to_markdown(index=False)
124
 
125
- # 3. Create a Markdown table for the engineered features
126
- features_df = sample_df.T.reset_index()
127
- features_df.columns = ['Feature', 'Value']
128
- features_df['Value'] = features_df['Value'].apply(lambda x: f"{x:.4f}" if isinstance(x, float) else x)
129
- features_table = features_df.to_markdown(index=False)
130
 
131
- return best_model_prediction, all_models_table, features_table
 
 
 
132
 
133
- # ==============================================================================
134
- # 3. GRADIO INTERFACE
135
- # ==============================================================================
136
 
 
137
  with gr.Blocks(theme=gr.themes.Soft(), title="AI Image Virality Predictor") as demo:
138
  gr.Markdown("# 🎨 AI Ghibli Image Virality Predictor")
139
- gr.Markdown("Enter image features to get a virality prediction from multiple regression models.")
140
 
141
  with gr.Row():
142
- # --- INPUTS COLUMN ---
143
  with gr.Column(scale=2):
144
- gr.Markdown("### 1. Input Features")
145
- with gr.Accordion("Core Engagement & Image Metrics", open=True):
146
- likes = gr.Slider(minimum=0, maximum=10000, value=500, step=10, label="Likes")
147
- style_accuracy_score = gr.Slider(minimum=0, maximum=100, value=85, step=1, label="Style Accuracy Score (%)")
148
- width = gr.Slider(minimum=256, maximum=2048, value=1024, step=64, label="Width (px)")
149
- height = gr.Slider(minimum=256, maximum=2048, value=1024, step=64, label="Height (px)")
150
- file_size_kb = gr.Slider(minimum=100, maximum=5000, value=1500, step=100, label="File Size (KB)")
151
-
152
- with gr.Accordion("Technical & Posting Details", open=True):
153
- generation_time = gr.Slider(minimum=1, maximum=30, value=8, step=0.5, label="Generation Time (s)")
154
- gpu_usage = gr.Slider(minimum=10, maximum=100, value=70, step=5, label="GPU Usage (%)")
155
- platform = gr.Radio(["Instagram", "Twitter", "TikTok", "Reddit"], label="Platform", value="Instagram")
156
- day_of_week = gr.Slider(minimum=0, maximum=6, value=4, step=1, label="Day of Week (0=Mon, 6=Sun)")
157
- month = gr.Slider(minimum=1, maximum=12, value=7, step=1, label="Month (1-12)")
158
- hour = gr.Slider(minimum=0, maximum=23, value=18, step=1, label="Hour of Day (0-23)")
159
- is_hand_edited = gr.Checkbox(label="Was it Hand Edited?", value=False)
160
- ethical_concerns_flag = gr.Checkbox(label="Any Ethical Concerns?", value=False)
161
-
162
- predict_btn = gr.Button("Predict Virality", variant="primary")
163
-
164
- # --- OUTPUTS COLUMN ---
 
 
 
 
 
 
 
 
 
 
 
 
165
  with gr.Column(scale=3):
166
- gr.Markdown("### 2. Prediction Results")
167
-
168
- # Highlighted Best Model Output
169
- best_model_output = gr.Number(
170
- label=f"🏆 Best Model Prediction ({BEST_MODEL_NAME})",
171
- interactive=False
172
- )
173
-
174
- # Table for All Model Predictions
175
- with gr.Accordion("Comparison of All Models", open=True):
176
- all_models_output = gr.Markdown(label="All Model Predictions")
177
-
178
- # Table for Feature Engineering Details
179
- with gr.Accordion("View Engineered Features", open=False):
180
- features_output = gr.Markdown(label="Feature Engineering Details")
 
 
 
 
 
 
 
 
181
 
182
  # Connect the button to the function
183
  predict_btn.click(
184
- fn=predict_shares_all_models,
185
  inputs=[
186
- likes, generation_time, gpu_usage, file_size_kb,
187
  width, height, style_accuracy_score,
188
  is_hand_edited, ethical_concerns_flag,
189
  day_of_week, month, hour, platform
190
  ],
191
  outputs=[
192
- best_model_output,
193
- all_models_output,
194
- features_output
 
 
195
  ]
196
  )
197
 
198
  # Launch the app
199
  if __name__ == "__main__":
 
200
  if not models_loaded:
201
  print("\nCannot launch Gradio app because models failed to load.")
202
  else:
203
- demo.launch()
 
 
 
 
1
+ # Gradio Demo App for Predicting Both Likes and Shares
2
+
3
  import gradio as gr
4
  import pandas as pd
5
  import numpy as np
6
  import joblib
7
  import os
8
 
9
# Best-performing model per target, as recorded under "best_models" in
# results/virality_analysis_results.json. Keep these in sync with the latest
# training run: they select which model's prediction is highlighted and
# which model name appears in the output labels.
BEST_MODEL_LIKES = 'Random Forest'
BEST_MODEL_SHARES = 'Ridge Regression'

# Set to True by load_models() once every model and the scaler have loaded.
models_loaded = False

# Module-level state populated by load_models().
all_models_likes = {}    # model name -> fitted regressor predicting likes
all_models_shares = {}   # model name -> fitted regressor predicting shares
model_names = []         # names of the models to load and query
scaler = None            # fitted feature scaler
expected_columns = []    # feature column order taken from the scaler
20
+
21
+ # Prediction Function for Both Likes and Shares
22
+ def predict_virality_all_models(generation_time, gpu_usage, file_size_kb,
23
+ width, height, style_accuracy_score,
24
+ is_hand_edited, ethical_concerns_flag,
25
+ day_of_week, month, hour, platform):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  """
27
+ Predicts both likes and shares using all loaded models.
 
28
  """
29
+ global all_models_likes, all_models_shares, model_names, scaler, expected_columns
30
+
31
+ # Create feature dictionary (WITHOUT likes)
 
 
32
  sample_data = {
 
33
  'style_accuracy_score': style_accuracy_score,
34
  'generation_time': generation_time,
35
  'gpu_usage': gpu_usage,
 
42
  'month': month,
43
  'hour': hour
44
  }
45
+
46
+ # Perform feature engineering
47
  sample_data['aspect_ratio'] = width / height if height > 0 else 0
48
  sample_data['total_pixels'] = width * height
49
  sample_data['is_square'] = int(width == height)
50
  sample_data['is_weekend'] = int(day_of_week >= 5)
51
+
52
+ # One-hot encode platform
53
  for p in ['Twitter', 'TikTok', 'Reddit', 'Instagram']:
54
  sample_data[f'platform_{p}'] = 1 if platform == p else 0
55
+
56
+ # Technical features
 
57
  sample_data['file_density'] = file_size_kb / (sample_data['total_pixels'] / 1000 + 1)
58
  sample_data['gpu_efficiency'] = generation_time / (gpu_usage + 1)
59
+
60
+ # Temporal cyclical features (continued)
 
 
61
  sample_data['month_sin'] = np.sin(2 * np.pi * month / 12)
62
  sample_data['month_cos'] = np.cos(2 * np.pi * month / 12)
63
  sample_data['day_sin'] = np.sin(2 * np.pi * day_of_week / 7)
64
  sample_data['day_cos'] = np.cos(2 * np.pi * day_of_week / 7)
65
+ sample_data['hour_sin'] = np.sin(2 * np.pi * hour / 24)
66
+ sample_data['hour_cos'] = np.cos(2 * np.pi * hour / 24)
67
+
68
+ # Create DataFrame and align columns
69
  sample_df = pd.DataFrame([sample_data])
70
  sample_df = sample_df.reindex(columns=expected_columns, fill_value=0)
71
+
72
+ # Scale features
73
+ try:
74
+ sample_scaled = scaler.transform(sample_df)
75
+ except Exception as e:
76
+ return {}, {}, f"Error during scaling: {e}"
77
+
78
+ # Predict with all models
79
+ predictions_likes = {}
80
+ predictions_shares = {}
81
+
82
+ for name in model_names:
83
+ # Predict likes
84
+ if name in all_models_likes:
85
+ pred_likes = all_models_likes[name].predict(sample_scaled)[0]
86
+ predictions_likes[name] = max(0, int(pred_likes))
87
+
88
+ # Predict shares
89
+ if name in all_models_shares:
90
+ pred_shares = all_models_shares[name].predict(sample_scaled)[0]
91
+ predictions_shares[name] = max(0, int(pred_shares))
92
+
93
+ return predictions_likes, predictions_shares, None
94
+
95
def load_models():
    """Load every likes/shares regression model and the scaler from ``models/``.

    Populates the module-level ``all_models_likes``, ``all_models_shares``,
    ``model_names``, ``scaler`` and ``expected_columns`` globals and sets
    ``models_loaded`` to reflect the outcome.

    Any failure is reported on stdout and leaves the globals in a clean
    "nothing loaded" state, so the caller can rely on ``models_loaded``
    instead of getting a traceback or a half-populated model set. This covers
    not only a missing file but also an unreadable artifact (for example a
    Git LFS pointer that was never fetched) or a scaler that lacks
    ``feature_names_in_``.

    Returns:
        bool: True when all models and the scaler loaded, False otherwise.
    """
    global all_models_likes, all_models_shares, model_names, scaler, expected_columns, models_loaded

    model_names = [
        'Linear Regression', 'Ridge Regression', 'Lasso Regression',
        'Random Forest', 'Gradient Boosting'
    ]

    # Load into locals first; globals are only published once everything succeeded.
    likes_models = {}
    shares_models = {}
    failure = None

    try:
        for name in model_names:
            # e.g. 'Random Forest' -> models/random_forest_likes.joblib
            stem = name.lower().replace(' ', '_')
            likes_models[name] = joblib.load(os.path.join("models", f"{stem}_likes.joblib"))
            shares_models[name] = joblib.load(os.path.join("models", f"{stem}_shares.joblib"))
            print(f"Loaded: {name} (both likes and shares)")

        loaded_scaler = joblib.load(os.path.join("models", "scaler.joblib"))
        print("Loaded: scaler.joblib")

        # The scaler records the column order it was fitted with.
        feature_names = loaded_scaler.feature_names_in_
        print(f"Model expects {len(feature_names)} features.")
    except FileNotFoundError as e:
        failure = f"\n❌ ERROR: Could not find a model file: {e}"
    except Exception as e:  # startup boundary: report instead of crashing
        failure = f"\n❌ ERROR: Could not load models ({type(e).__name__}): {e}"

    if failure is not None:
        print(failure)
        print("Please make sure all '.joblib' files are in the 'models/' directory.")
        all_models_likes = {}
        all_models_shares = {}
        scaler = None
        expected_columns = []
        models_loaded = False
        return False

    all_models_likes = likes_models
    all_models_shares = shares_models
    scaler = loaded_scaler
    expected_columns = feature_names
    models_loaded = True
    print("\n✅ All models and scaler loaded successfully!")
    return True
135
+
136
+
137
def _predictions_table(predictions, value_column):
    """Render ``{model name: prediction}`` as a Markdown table, highest value first."""
    table = pd.DataFrame(list(predictions.items()), columns=['Model', value_column])
    table = table.sort_values(value_column, ascending=False)
    return table.to_markdown(index=False)


def _mean_prediction(predictions):
    """Return the mean prediction, or 0.0 when no model produced one (avoids NaN)."""
    if not predictions:
        return 0.0
    return float(np.mean(list(predictions.values())))


def predict_virality_gradio(generation_time, gpu_usage, file_size_kb,
                            width, height, style_accuracy_score,
                            is_hand_edited, ethical_concerns_flag,
                            day_of_week, month, hour, platform):
    """Gradio callback: predict likes and shares and format them for display.

    The arguments are the raw widget values, in the order listed in the
    ``inputs`` of the button's click handler.

    Returns:
        tuple: ``(best_likes, best_shares, likes_table, shares_table, summary)``
        where the first two are the predictions of ``BEST_MODEL_LIKES`` and
        ``BEST_MODEL_SHARES`` (0 if that model produced none), the tables are
        Markdown comparisons of all models, and ``summary`` is a Markdown
        block with the cross-model averages. When the models are not loaded
        or prediction fails, the numbers are 0 and all three text outputs
        carry the error message.
    """
    if not models_loaded:
        error_msg = "Models are not loaded. Please check the console for errors."
        return 0, 0, error_msg, error_msg, error_msg

    # Get predictions from every loaded model
    likes_preds, shares_preds, error = predict_virality_all_models(
        generation_time, gpu_usage, file_size_kb,
        width, height, style_accuracy_score,
        is_hand_edited, ethical_concerns_flag,
        day_of_week, month, hour, platform
    )

    if error:
        return 0, 0, error, error, error

    # Headline numbers come from the configured best model per target
    best_likes = likes_preds.get(BEST_MODEL_LIKES, 0)
    best_shares = shares_preds.get(BEST_MODEL_SHARES, 0)

    # Per-model comparison tables
    likes_table = _predictions_table(likes_preds, 'Predicted Likes')
    shares_table = _predictions_table(shares_preds, 'Predicted Shares')

    # Summary statistics; the helper guards against an empty prediction set,
    # which would otherwise print "nan" and emit a NumPy RuntimeWarning.
    summary = f"""
    ### Prediction Summary

    **Average Predictions Across All Models:**
    - Likes: {_mean_prediction(likes_preds):.0f}
    - Shares: {_mean_prediction(shares_preds):.0f}
    """

    return best_likes, best_shares, likes_table, shares_table, summary
 
 
183
 
184
+ # Create Gradio interface
185
with gr.Blocks(theme=gr.themes.Soft(), title="AI Image Virality Predictor") as demo:
    # Page header
    gr.Markdown("# 🎨 AI Ghibli Image Virality Predictor")
    gr.Markdown("Predict both **Likes** and **Shares** for your AI-generated Ghibli-style images!")

    with gr.Row():
        # Input Column: one widget per argument of predict_virality_gradio
        with gr.Column(scale=2):
            gr.Markdown("### 📝 Input Features")

            with gr.Accordion("Image Properties", open=True):
                width = gr.Slider(minimum=256, maximum=2048, value=1024, step=64,
                                  label="Width (px)")
                height = gr.Slider(minimum=256, maximum=2048, value=1024, step=64,
                                   label="Height (px)")
                file_size_kb = gr.Slider(minimum=100, maximum=5000, value=1500, step=100,
                                         label="File Size (KB)")
                style_accuracy_score = gr.Slider(minimum=0, maximum=100, value=85, step=1,
                                                 label="Style Accuracy Score (%)")

            with gr.Accordion("Technical Details", open=True):
                generation_time = gr.Slider(minimum=1, maximum=30, value=8, step=0.5,
                                            label="Generation Time (seconds)")
                gpu_usage = gr.Slider(minimum=10, maximum=100, value=70, step=5,
                                      label="GPU Usage (%)")
                is_hand_edited = gr.Checkbox(label="Hand Edited?", value=False)
                ethical_concerns_flag = gr.Checkbox(label="Ethical Concerns?", value=False)

            with gr.Accordion("Posting Details", open=True):
                platform = gr.Radio(["Instagram", "Twitter", "TikTok", "Reddit"],
                                    label="Platform", value="Instagram")
                day_of_week = gr.Slider(minimum=0, maximum=6, value=4, step=1,
                                        label="Day of Week (0=Mon, 6=Sun)")
                month = gr.Slider(minimum=1, maximum=12, value=7, step=1,
                                  label="Month (1-12)")
                hour = gr.Slider(minimum=0, maximum=23, value=18, step=1,
                                 label="Hour of Day (0-23)")

            predict_btn = gr.Button("🚀 Predict Virality", variant="primary", size="lg")

        # Output Column
        with gr.Column(scale=3):
            gr.Markdown("### 📊 Prediction Results")

            # Main predictions: the configured best model for each target
            with gr.Row():
                best_likes_output = gr.Number(
                    label=f"❤️ Predicted Likes ({BEST_MODEL_LIKES})",
                    interactive=False
                )
                best_shares_output = gr.Number(
                    label=f"🔄 Predicted Shares ({BEST_MODEL_SHARES})",
                    interactive=False
                )

            # Summary (averages across all models)
            summary_output = gr.Markdown(label="Summary")

            # Detailed per-model predictions, collapsed by default
            with gr.Row():
                with gr.Accordion("All Models - Likes", open=False):
                    likes_table_output = gr.Markdown(label="Likes Predictions")

                with gr.Accordion("All Models - Shares", open=False):
                    shares_table_output = gr.Markdown(label="Shares Predictions")

    # Connect the button to the function.
    # `inputs` are passed positionally, so their order must match the
    # parameter order of predict_virality_gradio; `outputs` must match the
    # order of its returned tuple (best likes, best shares, likes table,
    # shares table, summary).
    predict_btn.click(
        fn=predict_virality_gradio,
        inputs=[
            generation_time, gpu_usage, file_size_kb,
            width, height, style_accuracy_score,
            is_hand_edited, ethical_concerns_flag,
            day_of_week, month, hour, platform
        ],
        outputs=[
            best_likes_output,
            best_shares_output,
            likes_table_output,
            shares_table_output,
            summary_output
        ]
    )
267
 
268
  # Launch the app
269
if __name__ == "__main__":
    # Models are loaded only when this file is run as a script; the UI is
    # started only if loading succeeded.
    load_models()
    if models_loaded:
        demo.launch()
    else:
        print("\nCannot launch Gradio app because models failed to load.")
models/{gradient_boosting.joblib → gradient_boosting_likes.joblib} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4504743d4614b446377f47be3bbd48450c338a03720dd92bb7ad50e5c43fbec9
3
- size 349128
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a90fbcfd2603f5d9d7737dccf118fbe0340a7f18561a2e33429e16b1b10804c5
3
+ size 382392
models/{random_forest.joblib → gradient_boosting_shares.joblib} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:17bdbcb259ca107357017da4d5dfa698d046c4029856b33c79234c1b7467eee2
3
- size 1989649
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14c9e4969342ad4f929267d938f9ed1d15072a067effb231a37e25fe317469ba
3
+ size 369144
models/lasso_regression.joblib DELETED
Binary file (864 Bytes)
 
models/lasso_regression_likes.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf7af80e4d8afc3ef86c7917fb6edd207bd84f97d9b57c634904e9b0761c0247
3
+ size 848
models/lasso_regression_shares.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a025a3dd6b2d9f4faa5484294f697dc12b10dc61b4c2d8d37ce85b4050f64ed
3
+ size 848
models/linear_regression.joblib DELETED
Binary file (1.03 kB)
 
models/linear_regression_likes.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0ead3f5960d70f01ca3660610ca29863c02cc47f3a4a527f7a026bfec71243c
3
+ size 993
models/linear_regression_shares.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7ce5eaa9cf59e8e851ef5c3a763d7546ba7234fd8dc57de5bc419003d878bbe
3
+ size 993
models/random_forest_likes.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:161deab58e5ccd2cf240526943849d165ba4344058d140c3559aea0ee1ffe3b8
3
+ size 2197297
models/random_forest_shares.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3b9de27b910354f61fa63f53f59af4a3d417d26f52d2542c09bad489b7729f6
3
+ size 2189073
models/ridge_regression.joblib DELETED
Binary file (785 Bytes)
 
models/ridge_regression_likes.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04682fcede103811ae2e8225c49845fc89ea436816d27cd655e3397c283a54ab
3
+ size 769
models/ridge_regression_shares.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68242a89f0a3b33a9409b856dc3ccc6720b9c11cbb7a19ef305e3c5691e2424d
3
+ size 769
models/scaler.joblib CHANGED
Binary files a/models/scaler.joblib and b/models/scaler.joblib differ
 
results/likes_model_comparison.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Model,R² Score,MAE,RMSE
2
+ Random Forest,-0.039935266835291694,1315.402911707407,1509.824436400549
3
+ Ridge Regression,-0.06857728183915435,1367.631134689801,1530.475089435485
4
+ Lasso Regression,-0.09319254230326957,1374.4757984522898,1548.0023905741184
5
+ Linear Regression,-0.10245590353474165,1377.3233648731018,1554.547191830137
6
+ Gradient Boosting,-0.2384770542071457,1414.7558634489615,1647.6587884958733
results/model_comparison.csv DELETED
@@ -1,6 +0,0 @@
1
- Model,R² Score,MAE,RMSE
2
- Random Forest,-0.08503617106593597,518.7684998150959,593.7506113831865
3
- Ridge Regression,-0.08676700700361528,528.5892908496174,594.223994396894
4
- Lasso Regression,-0.08764565917116918,528.573248905534,594.4641611976449
5
- Linear Regression,-0.09914175984109797,531.4132783380423,597.5975604762958
6
- Gradient Boosting,-0.2357883805937282,537.630957194942,633.6566769562837
 
 
 
 
 
 
 
results/regression_analysis_results.json DELETED
@@ -1,83 +0,0 @@
1
- {
2
- "dataset_info": {
3
- "total_samples": 500,
4
- "features": 29,
5
- "target_mean": 1040.182,
6
- "target_median": 1092.0,
7
- "target_std": 562.6687383302794
8
- },
9
- "model_comparison": [
10
- {
11
- "Model": "Random Forest",
12
- "R\u00b2 Score": -0.08503617106593597,
13
- "MAE": 518.7684998150959,
14
- "RMSE": 593.7506113831865
15
- },
16
- {
17
- "Model": "Ridge Regression",
18
- "R\u00b2 Score": -0.08676700700361528,
19
- "MAE": 528.5892908496174,
20
- "RMSE": 594.223994396894
21
- },
22
- {
23
- "Model": "Lasso Regression",
24
- "R\u00b2 Score": -0.08764565917116918,
25
- "MAE": 528.573248905534,
26
- "RMSE": 594.4641611976449
27
- },
28
- {
29
- "Model": "Linear Regression",
30
- "R\u00b2 Score": -0.09914175984109797,
31
- "MAE": 531.4132783380423,
32
- "RMSE": 597.5975604762958
33
- },
34
- {
35
- "Model": "Gradient Boosting",
36
- "R\u00b2 Score": -0.2357883805937282,
37
- "MAE": 537.630957194942,
38
- "RMSE": 633.6566769562837
39
- }
40
- ],
41
- "feature_correlations": [
42
- {
43
- "feature": "platform_Twitter",
44
- "correlation": -0.11310486794074195
45
- },
46
- {
47
- "feature": "platform_Instagram",
48
- "correlation": 0.07096989443791954
49
- },
50
- {
51
- "feature": "total_pixels",
52
- "correlation": 0.05340067376167711
53
- },
54
- {
55
- "feature": "width",
56
- "correlation": 0.050954190148673084
57
- },
58
- {
59
- "feature": "height",
60
- "correlation": 0.050954190148673084
61
- },
62
- {
63
- "feature": "platform_Reddit",
64
- "correlation": 0.030824709708669493
65
- },
66
- {
67
- "feature": "likes",
68
- "correlation": -0.029318071149881914
69
- },
70
- {
71
- "feature": "is_hand_edited",
72
- "correlation": 0.02824023580536551
73
- },
74
- {
75
- "feature": "day_of_week",
76
- "correlation": 0.02490306263783807
77
- },
78
- {
79
- "feature": "file_size_kb",
80
- "correlation": -0.020748477243945303
81
- }
82
- ]
83
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/shares_model_comparison.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Model,R² Score,MAE,RMSE
2
+ Ridge Regression,-0.07838647409138644,526.23107028139,591.928400457892
3
+ Lasso Regression,-0.07887883281663877,526.2428483005319,592.0635133610481
4
+ Random Forest,-0.08251842386606412,518.1355614972136,593.0613338289508
5
+ Linear Regression,-0.0852709784249106,527.3942652983482,593.8148532374418
6
+ Gradient Boosting,-0.22974105654409227,529.9244912155209,632.104377749423
results/virality_analysis_results.json ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_info": {
3
+ "total_samples": 500,
4
+ "features": 27,
5
+ "likes_mean": 2601.262,
6
+ "likes_median": 2566.5,
7
+ "likes_std": 1429.4334981408595,
8
+ "shares_mean": 1040.182,
9
+ "shares_median": 1092.0,
10
+ "shares_std": 562.6687383302794,
11
+ "likes_shares_correlation": -0.029318071149881914
12
+ },
13
+ "likes_model_comparison": [
14
+ {
15
+ "Model": "Random Forest",
16
+ "R\u00b2 Score": -0.039935266835291694,
17
+ "MAE": 1315.402911707407,
18
+ "RMSE": 1509.824436400549
19
+ },
20
+ {
21
+ "Model": "Ridge Regression",
22
+ "R\u00b2 Score": -0.06857728183915435,
23
+ "MAE": 1367.631134689801,
24
+ "RMSE": 1530.475089435485
25
+ },
26
+ {
27
+ "Model": "Lasso Regression",
28
+ "R\u00b2 Score": -0.09319254230326957,
29
+ "MAE": 1374.4757984522898,
30
+ "RMSE": 1548.0023905741184
31
+ },
32
+ {
33
+ "Model": "Linear Regression",
34
+ "R\u00b2 Score": -0.10245590353474165,
35
+ "MAE": 1377.3233648731018,
36
+ "RMSE": 1554.547191830137
37
+ },
38
+ {
39
+ "Model": "Gradient Boosting",
40
+ "R\u00b2 Score": -0.2384770542071457,
41
+ "MAE": 1414.7558634489615,
42
+ "RMSE": 1647.6587884958733
43
+ }
44
+ ],
45
+ "shares_model_comparison": [
46
+ {
47
+ "Model": "Ridge Regression",
48
+ "R\u00b2 Score": -0.07838647409138644,
49
+ "MAE": 526.23107028139,
50
+ "RMSE": 591.928400457892
51
+ },
52
+ {
53
+ "Model": "Lasso Regression",
54
+ "R\u00b2 Score": -0.07887883281663877,
55
+ "MAE": 526.2428483005319,
56
+ "RMSE": 592.0635133610481
57
+ },
58
+ {
59
+ "Model": "Random Forest",
60
+ "R\u00b2 Score": -0.08251842386606412,
61
+ "MAE": 518.1355614972136,
62
+ "RMSE": 593.0613338289508
63
+ },
64
+ {
65
+ "Model": "Linear Regression",
66
+ "R\u00b2 Score": -0.0852709784249106,
67
+ "MAE": 527.3942652983482,
68
+ "RMSE": 593.8148532374418
69
+ },
70
+ {
71
+ "Model": "Gradient Boosting",
72
+ "R\u00b2 Score": -0.22974105654409227,
73
+ "MAE": 529.9244912155209,
74
+ "RMSE": 632.104377749423
75
+ }
76
+ ],
77
+ "best_models": {
78
+ "likes": "Random Forest",
79
+ "shares": "Ridge Regression"
80
+ }
81
+ }
results/virality_summary_report.txt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ VIRALITY PREDICTION MODEL SUMMARY
3
+ ================================
4
+
5
+ Dataset Overview:
6
+ - Total samples: 500
7
+ - Features used: 27
8
+ - Correlation between Likes and Shares: -0.029
9
+
10
+ Best Models:
11
+ - Likes Prediction: Random Forest (R² = -0.040)
12
+ - Shares Prediction: Ridge Regression (R² = -0.078)
13
+
14
+ Key Insights:
15
+ 1. Both likes and shares show similar patterns in terms of model performance
16
+ 2. No model family is clearly better: Random Forest ranks first for likes and Ridge Regression for shares, while Gradient Boosting ranks last for both targets
17
+ 3. Technical features (generation time, GPU usage) and temporal features are important predictors
18
+ 4. Platform-specific patterns exist and should be considered for optimization
19
+
20
+ Recommendations:
21
+ 1. Use separate models for likes and shares predictions
22
+ 2. Consider ensemble methods for improved accuracy
23
+ 3. Regular retraining with new data is recommended
24
+ 4. Monitor feature importance to understand changing virality patterns