Change the logic to predict both likes and shares. Previously, likes was used as an input feature, which was incorrect.
Browse files
- Data_Analytics_SHE_Course_Group_Assignment_Machine_Learning.ipynb +0 -0
- app.py +211 -137
- models/{gradient_boosting.joblib → gradient_boosting_likes.joblib} +2 -2
- models/{random_forest.joblib → gradient_boosting_shares.joblib} +2 -2
- models/lasso_regression.joblib +0 -0
- models/lasso_regression_likes.joblib +3 -0
- models/lasso_regression_shares.joblib +3 -0
- models/linear_regression.joblib +0 -0
- models/linear_regression_likes.joblib +3 -0
- models/linear_regression_shares.joblib +3 -0
- models/random_forest_likes.joblib +3 -0
- models/random_forest_shares.joblib +3 -0
- models/ridge_regression.joblib +0 -0
- models/ridge_regression_likes.joblib +3 -0
- models/ridge_regression_shares.joblib +3 -0
- models/scaler.joblib +0 -0
- results/likes_model_comparison.csv +6 -0
- results/model_comparison.csv +0 -6
- results/regression_analysis_results.json +0 -83
- results/shares_model_comparison.csv +6 -0
- results/virality_analysis_results.json +81 -0
- results/virality_summary_report.txt +24 -0
Data_Analytics_SHE_Course_Group_Assignment_Machine_Learning.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
app.py
CHANGED
@@ -1,72 +1,35 @@
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
import joblib
|
5 |
import os
|
6 |
|
7 |
-
#
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
if os.path.exists(filename):
|
25 |
-
all_models[name] = joblib.load(filename)
|
26 |
-
else:
|
27 |
-
raise FileNotFoundError(f"Model file not found: {filename}")
|
28 |
-
|
29 |
-
# Load the scaler
|
30 |
-
scaler_path = 'models/scaler.joblib'
|
31 |
-
if os.path.exists(scaler_path):
|
32 |
-
scaler = joblib.load(scaler_path)
|
33 |
-
else:
|
34 |
-
raise FileNotFoundError(f"Scaler file not found: {scaler_path}")
|
35 |
-
|
36 |
-
models_loaded = True
|
37 |
-
print("✅ All models and scaler loaded successfully!")
|
38 |
-
|
39 |
-
# Get the feature names the model was trained on from the scaler
|
40 |
-
expected_columns = scaler.feature_names_in_
|
41 |
-
print(f"Models expect {len(expected_columns)} features.")
|
42 |
-
|
43 |
-
except Exception as e:
|
44 |
-
print(f"❌ ERROR: Could not load models. {e}")
|
45 |
-
print("Please ensure all '.joblib' files are in the 'models/' directory.")
|
46 |
-
models_loaded = False
|
47 |
-
all_models = {}
|
48 |
-
scaler = None
|
49 |
-
expected_columns = []
|
50 |
-
|
51 |
-
# ==============================================================================
|
52 |
-
# 2. PREDICTION FUNCTION
|
53 |
-
# ==============================================================================
|
54 |
-
|
55 |
-
def predict_shares_all_models(likes, generation_time, gpu_usage, file_size_kb,
|
56 |
-
width, height, style_accuracy_score,
|
57 |
-
is_hand_edited, ethical_concerns_flag,
|
58 |
-
day_of_week, month, hour, platform):
|
59 |
"""
|
60 |
-
|
61 |
-
and returns formatted outputs for the Gradio interface.
|
62 |
"""
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
# --- Step A: Perform feature engineering ---
|
68 |
sample_data = {
|
69 |
-
'likes': likes,
|
70 |
'style_accuracy_score': style_accuracy_score,
|
71 |
'generation_time': generation_time,
|
72 |
'gpu_usage': gpu_usage,
|
@@ -79,125 +42,236 @@ def predict_shares_all_models(likes, generation_time, gpu_usage, file_size_kb,
|
|
79 |
'month': month,
|
80 |
'hour': hour
|
81 |
}
|
82 |
-
|
|
|
83 |
sample_data['aspect_ratio'] = width / height if height > 0 else 0
|
84 |
sample_data['total_pixels'] = width * height
|
85 |
sample_data['is_square'] = int(width == height)
|
86 |
sample_data['is_weekend'] = int(day_of_week >= 5)
|
87 |
-
|
|
|
88 |
for p in ['Twitter', 'TikTok', 'Reddit', 'Instagram']:
|
89 |
sample_data[f'platform_{p}'] = 1 if platform == p else 0
|
90 |
-
|
91 |
-
|
92 |
-
sample_data['quality_engagement'] = style_accuracy_score * likes / 100
|
93 |
sample_data['file_density'] = file_size_kb / (sample_data['total_pixels'] / 1000 + 1)
|
94 |
sample_data['gpu_efficiency'] = generation_time / (gpu_usage + 1)
|
95 |
-
|
96 |
-
|
97 |
-
sample_data[f'{p.lower()}_likes'] = likes * sample_data[f'platform_{p}']
|
98 |
-
|
99 |
sample_data['month_sin'] = np.sin(2 * np.pi * month / 12)
|
100 |
sample_data['month_cos'] = np.cos(2 * np.pi * month / 12)
|
101 |
sample_data['day_sin'] = np.sin(2 * np.pi * day_of_week / 7)
|
102 |
sample_data['day_cos'] = np.cos(2 * np.pi * day_of_week / 7)
|
103 |
-
|
104 |
-
|
|
|
|
|
105 |
sample_df = pd.DataFrame([sample_data])
|
106 |
sample_df = sample_df.reindex(columns=expected_columns, fill_value=0)
|
107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
|
109 |
-
|
110 |
-
|
111 |
-
for name, model in all_models.items():
|
112 |
-
pred_value = model.predict(sample_scaled)[0]
|
113 |
-
predictions[name] = max(0, int(pred_value))
|
114 |
|
115 |
-
#
|
|
|
|
|
116 |
|
117 |
-
#
|
118 |
-
|
|
|
|
|
119 |
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
all_models_table = all_results_df.to_markdown(index=False)
|
124 |
|
125 |
-
#
|
126 |
-
|
127 |
-
|
128 |
-
features_df['Value'] = features_df['Value'].apply(lambda x: f"{x:.4f}" if isinstance(x, float) else x)
|
129 |
-
features_table = features_df.to_markdown(index=False)
|
130 |
|
131 |
-
|
|
|
|
|
|
|
132 |
|
133 |
-
|
134 |
-
# 3. GRADIO INTERFACE
|
135 |
-
# ==============================================================================
|
136 |
|
|
|
137 |
with gr.Blocks(theme=gr.themes.Soft(), title="AI Image Virality Predictor") as demo:
|
138 |
gr.Markdown("# 🎨 AI Ghibli Image Virality Predictor")
|
139 |
-
gr.Markdown("
|
140 |
|
141 |
with gr.Row():
|
142 |
-
#
|
143 |
with gr.Column(scale=2):
|
144 |
-
gr.Markdown("###
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
height = gr.Slider(minimum=256, maximum=2048, value=1024, step=64,
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
165 |
with gr.Column(scale=3):
|
166 |
-
gr.Markdown("###
|
167 |
-
|
168 |
-
#
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
181 |
|
182 |
# Connect the button to the function
|
183 |
predict_btn.click(
|
184 |
-
fn=
|
185 |
inputs=[
|
186 |
-
|
187 |
width, height, style_accuracy_score,
|
188 |
is_hand_edited, ethical_concerns_flag,
|
189 |
day_of_week, month, hour, platform
|
190 |
],
|
191 |
outputs=[
|
192 |
-
|
193 |
-
|
194 |
-
|
|
|
|
|
195 |
]
|
196 |
)
|
197 |
|
198 |
# Launch the app
|
199 |
if __name__ == "__main__":
|
|
|
200 |
if not models_loaded:
|
201 |
print("\nCannot launch Gradio app because models failed to load.")
|
202 |
else:
|
203 |
-
demo.launch(
|
|
|
|
|
|
|
|
1 |
+
# Gradio Demo App for Predicting Both Likes and Shares
|
2 |
+
|
3 |
import gradio as gr
|
4 |
import pandas as pd
|
5 |
import numpy as np
|
6 |
import joblib
|
7 |
import os
|
8 |
|
9 |
+
# Best models (update based on your results)
|
10 |
+
BEST_MODEL_LIKES = 'Random Forest'
|
11 |
+
BEST_MODEL_SHARES = 'Random Forest'
|
12 |
+
models_loaded = False
|
13 |
+
|
14 |
+
# Global variables for models and scaler
|
15 |
+
all_models_likes = {}
|
16 |
+
all_models_shares = {}
|
17 |
+
model_names = []
|
18 |
+
scaler = None
|
19 |
+
expected_columns = []
|
20 |
+
|
21 |
+
# Prediction Function for Both Likes and Shares
|
22 |
+
def predict_virality_all_models(generation_time, gpu_usage, file_size_kb,
|
23 |
+
width, height, style_accuracy_score,
|
24 |
+
is_hand_edited, ethical_concerns_flag,
|
25 |
+
day_of_week, month, hour, platform):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
"""
|
27 |
+
Predicts both likes and shares using all loaded models.
|
|
|
28 |
"""
|
29 |
+
global all_models_likes, all_models_shares, model_names, scaler, expected_columns
|
30 |
+
|
31 |
+
# Create feature dictionary (WITHOUT likes)
|
|
|
|
|
32 |
sample_data = {
|
|
|
33 |
'style_accuracy_score': style_accuracy_score,
|
34 |
'generation_time': generation_time,
|
35 |
'gpu_usage': gpu_usage,
|
|
|
42 |
'month': month,
|
43 |
'hour': hour
|
44 |
}
|
45 |
+
|
46 |
+
# Perform feature engineering
|
47 |
sample_data['aspect_ratio'] = width / height if height > 0 else 0
|
48 |
sample_data['total_pixels'] = width * height
|
49 |
sample_data['is_square'] = int(width == height)
|
50 |
sample_data['is_weekend'] = int(day_of_week >= 5)
|
51 |
+
|
52 |
+
# One-hot encode platform
|
53 |
for p in ['Twitter', 'TikTok', 'Reddit', 'Instagram']:
|
54 |
sample_data[f'platform_{p}'] = 1 if platform == p else 0
|
55 |
+
|
56 |
+
# Technical features
|
|
|
57 |
sample_data['file_density'] = file_size_kb / (sample_data['total_pixels'] / 1000 + 1)
|
58 |
sample_data['gpu_efficiency'] = generation_time / (gpu_usage + 1)
|
59 |
+
|
60 |
+
# Temporal cyclical features (continued)
|
|
|
|
|
61 |
sample_data['month_sin'] = np.sin(2 * np.pi * month / 12)
|
62 |
sample_data['month_cos'] = np.cos(2 * np.pi * month / 12)
|
63 |
sample_data['day_sin'] = np.sin(2 * np.pi * day_of_week / 7)
|
64 |
sample_data['day_cos'] = np.cos(2 * np.pi * day_of_week / 7)
|
65 |
+
sample_data['hour_sin'] = np.sin(2 * np.pi * hour / 24)
|
66 |
+
sample_data['hour_cos'] = np.cos(2 * np.pi * hour / 24)
|
67 |
+
|
68 |
+
# Create DataFrame and align columns
|
69 |
sample_df = pd.DataFrame([sample_data])
|
70 |
sample_df = sample_df.reindex(columns=expected_columns, fill_value=0)
|
71 |
+
|
72 |
+
# Scale features
|
73 |
+
try:
|
74 |
+
sample_scaled = scaler.transform(sample_df)
|
75 |
+
except Exception as e:
|
76 |
+
return {}, {}, f"Error during scaling: {e}"
|
77 |
+
|
78 |
+
# Predict with all models
|
79 |
+
predictions_likes = {}
|
80 |
+
predictions_shares = {}
|
81 |
+
|
82 |
+
for name in model_names:
|
83 |
+
# Predict likes
|
84 |
+
if name in all_models_likes:
|
85 |
+
pred_likes = all_models_likes[name].predict(sample_scaled)[0]
|
86 |
+
predictions_likes[name] = max(0, int(pred_likes))
|
87 |
+
|
88 |
+
# Predict shares
|
89 |
+
if name in all_models_shares:
|
90 |
+
pred_shares = all_models_shares[name].predict(sample_scaled)[0]
|
91 |
+
predictions_shares[name] = max(0, int(pred_shares))
|
92 |
+
|
93 |
+
return predictions_likes, predictions_shares, None
|
94 |
+
|
95 |
+
def load_models():
|
96 |
+
# Load Models for Both Likes and Shares
|
97 |
+
global all_models_likes, all_models_shares, model_names, scaler, expected_columns, models_loaded
|
98 |
+
|
99 |
+
# Dictionaries to hold the loaded model objects
|
100 |
+
all_models_likes = {}
|
101 |
+
all_models_shares = {}
|
102 |
+
model_names = [
|
103 |
+
'Linear Regression', 'Ridge Regression', 'Lasso Regression',
|
104 |
+
'Random Forest', 'Gradient Boosting'
|
105 |
+
]
|
106 |
+
|
107 |
+
try:
|
108 |
+
# Load all the regression models for both targets
|
109 |
+
for name in model_names:
|
110 |
+
# Load likes model
|
111 |
+
filename_likes = os.path.join("models", f"{name.lower().replace(' ', '_')}_likes.joblib")
|
112 |
+
all_models_likes[name] = joblib.load(filename_likes)
|
113 |
+
|
114 |
+
# Load shares model
|
115 |
+
filename_shares = os.path.join("models", f"{name.lower().replace(' ', '_')}_shares.joblib")
|
116 |
+
all_models_shares[name] = joblib.load(filename_shares)
|
117 |
+
|
118 |
+
print(f"Loaded: {name} (both likes and shares)")
|
119 |
+
|
120 |
+
# Load the scaler
|
121 |
+
scaler = joblib.load(os.path.join("models", "scaler.joblib"))
|
122 |
+
print("Loaded: scaler.joblib")
|
123 |
+
|
124 |
+
# Get the feature names
|
125 |
+
expected_columns = scaler.feature_names_in_
|
126 |
+
print(f"Model expects {len(expected_columns)} features.")
|
127 |
+
|
128 |
+
models_loaded = True
|
129 |
+
print("\n✅ All models and scaler loaded successfully!")
|
130 |
+
|
131 |
+
except FileNotFoundError as e:
|
132 |
+
print(f"\n❌ ERROR: Could not find a model file: {e}")
|
133 |
+
print("Please make sure all '.joblib' files are in the 'models/' directory.")
|
134 |
+
models_loaded = False
|
135 |
+
|
136 |
+
|
137 |
+
def predict_virality_gradio(generation_time, gpu_usage, file_size_kb,
|
138 |
+
width, height, style_accuracy_score,
|
139 |
+
is_hand_edited, ethical_concerns_flag,
|
140 |
+
day_of_week, month, hour, platform):
|
141 |
+
"""
|
142 |
+
Gradio wrapper for the prediction function.
|
143 |
+
Returns formatted outputs for both likes and shares.
|
144 |
+
"""
|
145 |
+
if not models_loaded:
|
146 |
+
error_msg = "Models are not loaded. Please check the console for errors."
|
147 |
+
return 0, 0, error_msg, error_msg, error_msg
|
148 |
+
|
149 |
+
# Get predictions
|
150 |
+
likes_preds, shares_preds, error = predict_virality_all_models(
|
151 |
+
generation_time, gpu_usage, file_size_kb,
|
152 |
+
width, height, style_accuracy_score,
|
153 |
+
is_hand_edited, ethical_concerns_flag,
|
154 |
+
day_of_week, month, hour, platform
|
155 |
+
)
|
156 |
|
157 |
+
if error:
|
158 |
+
return 0, 0, error, error, error
|
|
|
|
|
|
|
159 |
|
160 |
+
# Get best model predictions
|
161 |
+
best_likes = likes_preds.get(BEST_MODEL_LIKES, 0)
|
162 |
+
best_shares = shares_preds.get(BEST_MODEL_SHARES, 0)
|
163 |
|
164 |
+
# Create comparison tables
|
165 |
+
likes_df = pd.DataFrame(list(likes_preds.items()), columns=['Model', 'Predicted Likes'])
|
166 |
+
likes_df = likes_df.sort_values('Predicted Likes', ascending=False)
|
167 |
+
likes_table = likes_df.to_markdown(index=False)
|
168 |
|
169 |
+
shares_df = pd.DataFrame(list(shares_preds.items()), columns=['Model', 'Predicted Shares'])
|
170 |
+
shares_df = shares_df.sort_values('Predicted Shares', ascending=False)
|
171 |
+
shares_table = shares_df.to_markdown(index=False)
|
|
|
172 |
|
173 |
+
# Create summary statistics
|
174 |
+
summary = f"""
|
175 |
+
### Prediction Summary
|
|
|
|
|
176 |
|
177 |
+
**Average Predictions Across All Models:**
|
178 |
+
- Likes: {np.mean(list(likes_preds.values())):.0f}
|
179 |
+
- Shares: {np.mean(list(shares_preds.values())):.0f}
|
180 |
+
"""
|
181 |
|
182 |
+
return best_likes, best_shares, likes_table, shares_table, summary
|
|
|
|
|
183 |
|
184 |
+
# Create Gradio interface
|
185 |
with gr.Blocks(theme=gr.themes.Soft(), title="AI Image Virality Predictor") as demo:
|
186 |
gr.Markdown("# 🎨 AI Ghibli Image Virality Predictor")
|
187 |
+
gr.Markdown("Predict both **Likes** and **Shares** for your AI-generated Ghibli-style images!")
|
188 |
|
189 |
with gr.Row():
|
190 |
+
# Input Column
|
191 |
with gr.Column(scale=2):
|
192 |
+
gr.Markdown("### 📝 Input Features")
|
193 |
+
|
194 |
+
with gr.Accordion("Image Properties", open=True):
|
195 |
+
width = gr.Slider(minimum=256, maximum=2048, value=1024, step=64,
|
196 |
+
label="Width (px)")
|
197 |
+
height = gr.Slider(minimum=256, maximum=2048, value=1024, step=64,
|
198 |
+
label="Height (px)")
|
199 |
+
file_size_kb = gr.Slider(minimum=100, maximum=5000, value=1500, step=100,
|
200 |
+
label="File Size (KB)")
|
201 |
+
style_accuracy_score = gr.Slider(minimum=0, maximum=100, value=85, step=1,
|
202 |
+
label="Style Accuracy Score (%)")
|
203 |
+
|
204 |
+
with gr.Accordion("Technical Details", open=True):
|
205 |
+
generation_time = gr.Slider(minimum=1, maximum=30, value=8, step=0.5,
|
206 |
+
label="Generation Time (seconds)")
|
207 |
+
gpu_usage = gr.Slider(minimum=10, maximum=100, value=70, step=5,
|
208 |
+
label="GPU Usage (%)")
|
209 |
+
is_hand_edited = gr.Checkbox(label="Hand Edited?", value=False)
|
210 |
+
ethical_concerns_flag = gr.Checkbox(label="Ethical Concerns?", value=False)
|
211 |
+
|
212 |
+
with gr.Accordion("Posting Details", open=True):
|
213 |
+
platform = gr.Radio(["Instagram", "Twitter", "TikTok", "Reddit"],
|
214 |
+
label="Platform", value="Instagram")
|
215 |
+
day_of_week = gr.Slider(minimum=0, maximum=6, value=4, step=1,
|
216 |
+
label="Day of Week (0=Mon, 6=Sun)")
|
217 |
+
month = gr.Slider(minimum=1, maximum=12, value=7, step=1,
|
218 |
+
label="Month (1-12)")
|
219 |
+
hour = gr.Slider(minimum=0, maximum=23, value=18, step=1,
|
220 |
+
label="Hour of Day (0-23)")
|
221 |
+
|
222 |
+
predict_btn = gr.Button("🚀 Predict Virality", variant="primary", size="lg")
|
223 |
+
|
224 |
+
# Output Column
|
225 |
with gr.Column(scale=3):
|
226 |
+
gr.Markdown("### 📊 Prediction Results")
|
227 |
+
|
228 |
+
# Main predictions
|
229 |
+
with gr.Row():
|
230 |
+
best_likes_output = gr.Number(
|
231 |
+
label=f"❤️ Predicted Likes ({BEST_MODEL_LIKES})",
|
232 |
+
interactive=False
|
233 |
+
)
|
234 |
+
best_shares_output = gr.Number(
|
235 |
+
label=f"🔄 Predicted Shares ({BEST_MODEL_SHARES})",
|
236 |
+
interactive=False
|
237 |
+
)
|
238 |
+
|
239 |
+
# Summary
|
240 |
+
summary_output = gr.Markdown(label="Summary")
|
241 |
+
|
242 |
+
# Detailed predictions (continued)
|
243 |
+
with gr.Row():
|
244 |
+
with gr.Accordion("All Models - Likes", open=False):
|
245 |
+
likes_table_output = gr.Markdown(label="Likes Predictions")
|
246 |
+
|
247 |
+
with gr.Accordion("All Models - Shares", open=False):
|
248 |
+
shares_table_output = gr.Markdown(label="Shares Predictions")
|
249 |
|
250 |
# Connect the button to the function
|
251 |
predict_btn.click(
|
252 |
+
fn=predict_virality_gradio,
|
253 |
inputs=[
|
254 |
+
generation_time, gpu_usage, file_size_kb,
|
255 |
width, height, style_accuracy_score,
|
256 |
is_hand_edited, ethical_concerns_flag,
|
257 |
day_of_week, month, hour, platform
|
258 |
],
|
259 |
outputs=[
|
260 |
+
best_likes_output,
|
261 |
+
best_shares_output,
|
262 |
+
likes_table_output,
|
263 |
+
shares_table_output,
|
264 |
+
summary_output
|
265 |
]
|
266 |
)
|
267 |
|
268 |
# Launch the app
|
269 |
if __name__ == "__main__":
|
270 |
+
load_models()
|
271 |
if not models_loaded:
|
272 |
print("\nCannot launch Gradio app because models failed to load.")
|
273 |
else:
|
274 |
+
demo.launch(
|
275 |
+
# share=True,
|
276 |
+
# debug=True
|
277 |
+
)
|
models/{gradient_boosting.joblib → gradient_boosting_likes.joblib}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a90fbcfd2603f5d9d7737dccf118fbe0340a7f18561a2e33429e16b1b10804c5
|
3 |
+
size 382392
|
models/{random_forest.joblib → gradient_boosting_shares.joblib}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:14c9e4969342ad4f929267d938f9ed1d15072a067effb231a37e25fe317469ba
|
3 |
+
size 369144
|
models/lasso_regression.joblib
DELETED
Binary file (864 Bytes)
|
|
models/lasso_regression_likes.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bf7af80e4d8afc3ef86c7917fb6edd207bd84f97d9b57c634904e9b0761c0247
|
3 |
+
size 848
|
models/lasso_regression_shares.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8a025a3dd6b2d9f4faa5484294f697dc12b10dc61b4c2d8d37ce85b4050f64ed
|
3 |
+
size 848
|
models/linear_regression.joblib
DELETED
Binary file (1.03 kB)
|
|
models/linear_regression_likes.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b0ead3f5960d70f01ca3660610ca29863c02cc47f3a4a527f7a026bfec71243c
|
3 |
+
size 993
|
models/linear_regression_shares.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d7ce5eaa9cf59e8e851ef5c3a763d7546ba7234fd8dc57de5bc419003d878bbe
|
3 |
+
size 993
|
models/random_forest_likes.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:161deab58e5ccd2cf240526943849d165ba4344058d140c3559aea0ee1ffe3b8
|
3 |
+
size 2197297
|
models/random_forest_shares.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e3b9de27b910354f61fa63f53f59af4a3d417d26f52d2542c09bad489b7729f6
|
3 |
+
size 2189073
|
models/ridge_regression.joblib
DELETED
Binary file (785 Bytes)
|
|
models/ridge_regression_likes.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:04682fcede103811ae2e8225c49845fc89ea436816d27cd655e3397c283a54ab
|
3 |
+
size 769
|
models/ridge_regression_shares.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:68242a89f0a3b33a9409b856dc3ccc6720b9c11cbb7a19ef305e3c5691e2424d
|
3 |
+
size 769
|
models/scaler.joblib
CHANGED
Binary files a/models/scaler.joblib and b/models/scaler.joblib differ
|
|
results/likes_model_comparison.csv
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,R² Score,MAE,RMSE
|
2 |
+
Random Forest,-0.039935266835291694,1315.402911707407,1509.824436400549
|
3 |
+
Ridge Regression,-0.06857728183915435,1367.631134689801,1530.475089435485
|
4 |
+
Lasso Regression,-0.09319254230326957,1374.4757984522898,1548.0023905741184
|
5 |
+
Linear Regression,-0.10245590353474165,1377.3233648731018,1554.547191830137
|
6 |
+
Gradient Boosting,-0.2384770542071457,1414.7558634489615,1647.6587884958733
|
results/model_comparison.csv
DELETED
@@ -1,6 +0,0 @@
|
|
1 |
-
Model,R² Score,MAE,RMSE
|
2 |
-
Random Forest,-0.08503617106593597,518.7684998150959,593.7506113831865
|
3 |
-
Ridge Regression,-0.08676700700361528,528.5892908496174,594.223994396894
|
4 |
-
Lasso Regression,-0.08764565917116918,528.573248905534,594.4641611976449
|
5 |
-
Linear Regression,-0.09914175984109797,531.4132783380423,597.5975604762958
|
6 |
-
Gradient Boosting,-0.2357883805937282,537.630957194942,633.6566769562837
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/regression_analysis_results.json
DELETED
@@ -1,83 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"dataset_info": {
|
3 |
-
"total_samples": 500,
|
4 |
-
"features": 29,
|
5 |
-
"target_mean": 1040.182,
|
6 |
-
"target_median": 1092.0,
|
7 |
-
"target_std": 562.6687383302794
|
8 |
-
},
|
9 |
-
"model_comparison": [
|
10 |
-
{
|
11 |
-
"Model": "Random Forest",
|
12 |
-
"R\u00b2 Score": -0.08503617106593597,
|
13 |
-
"MAE": 518.7684998150959,
|
14 |
-
"RMSE": 593.7506113831865
|
15 |
-
},
|
16 |
-
{
|
17 |
-
"Model": "Ridge Regression",
|
18 |
-
"R\u00b2 Score": -0.08676700700361528,
|
19 |
-
"MAE": 528.5892908496174,
|
20 |
-
"RMSE": 594.223994396894
|
21 |
-
},
|
22 |
-
{
|
23 |
-
"Model": "Lasso Regression",
|
24 |
-
"R\u00b2 Score": -0.08764565917116918,
|
25 |
-
"MAE": 528.573248905534,
|
26 |
-
"RMSE": 594.4641611976449
|
27 |
-
},
|
28 |
-
{
|
29 |
-
"Model": "Linear Regression",
|
30 |
-
"R\u00b2 Score": -0.09914175984109797,
|
31 |
-
"MAE": 531.4132783380423,
|
32 |
-
"RMSE": 597.5975604762958
|
33 |
-
},
|
34 |
-
{
|
35 |
-
"Model": "Gradient Boosting",
|
36 |
-
"R\u00b2 Score": -0.2357883805937282,
|
37 |
-
"MAE": 537.630957194942,
|
38 |
-
"RMSE": 633.6566769562837
|
39 |
-
}
|
40 |
-
],
|
41 |
-
"feature_correlations": [
|
42 |
-
{
|
43 |
-
"feature": "platform_Twitter",
|
44 |
-
"correlation": -0.11310486794074195
|
45 |
-
},
|
46 |
-
{
|
47 |
-
"feature": "platform_Instagram",
|
48 |
-
"correlation": 0.07096989443791954
|
49 |
-
},
|
50 |
-
{
|
51 |
-
"feature": "total_pixels",
|
52 |
-
"correlation": 0.05340067376167711
|
53 |
-
},
|
54 |
-
{
|
55 |
-
"feature": "width",
|
56 |
-
"correlation": 0.050954190148673084
|
57 |
-
},
|
58 |
-
{
|
59 |
-
"feature": "height",
|
60 |
-
"correlation": 0.050954190148673084
|
61 |
-
},
|
62 |
-
{
|
63 |
-
"feature": "platform_Reddit",
|
64 |
-
"correlation": 0.030824709708669493
|
65 |
-
},
|
66 |
-
{
|
67 |
-
"feature": "likes",
|
68 |
-
"correlation": -0.029318071149881914
|
69 |
-
},
|
70 |
-
{
|
71 |
-
"feature": "is_hand_edited",
|
72 |
-
"correlation": 0.02824023580536551
|
73 |
-
},
|
74 |
-
{
|
75 |
-
"feature": "day_of_week",
|
76 |
-
"correlation": 0.02490306263783807
|
77 |
-
},
|
78 |
-
{
|
79 |
-
"feature": "file_size_kb",
|
80 |
-
"correlation": -0.020748477243945303
|
81 |
-
}
|
82 |
-
]
|
83 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/shares_model_comparison.csv
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,R² Score,MAE,RMSE
|
2 |
+
Ridge Regression,-0.07838647409138644,526.23107028139,591.928400457892
|
3 |
+
Lasso Regression,-0.07887883281663877,526.2428483005319,592.0635133610481
|
4 |
+
Random Forest,-0.08251842386606412,518.1355614972136,593.0613338289508
|
5 |
+
Linear Regression,-0.0852709784249106,527.3942652983482,593.8148532374418
|
6 |
+
Gradient Boosting,-0.22974105654409227,529.9244912155209,632.104377749423
|
results/virality_analysis_results.json
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset_info": {
|
3 |
+
"total_samples": 500,
|
4 |
+
"features": 27,
|
5 |
+
"likes_mean": 2601.262,
|
6 |
+
"likes_median": 2566.5,
|
7 |
+
"likes_std": 1429.4334981408595,
|
8 |
+
"shares_mean": 1040.182,
|
9 |
+
"shares_median": 1092.0,
|
10 |
+
"shares_std": 562.6687383302794,
|
11 |
+
"likes_shares_correlation": -0.029318071149881914
|
12 |
+
},
|
13 |
+
"likes_model_comparison": [
|
14 |
+
{
|
15 |
+
"Model": "Random Forest",
|
16 |
+
"R\u00b2 Score": -0.039935266835291694,
|
17 |
+
"MAE": 1315.402911707407,
|
18 |
+
"RMSE": 1509.824436400549
|
19 |
+
},
|
20 |
+
{
|
21 |
+
"Model": "Ridge Regression",
|
22 |
+
"R\u00b2 Score": -0.06857728183915435,
|
23 |
+
"MAE": 1367.631134689801,
|
24 |
+
"RMSE": 1530.475089435485
|
25 |
+
},
|
26 |
+
{
|
27 |
+
"Model": "Lasso Regression",
|
28 |
+
"R\u00b2 Score": -0.09319254230326957,
|
29 |
+
"MAE": 1374.4757984522898,
|
30 |
+
"RMSE": 1548.0023905741184
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"Model": "Linear Regression",
|
34 |
+
"R\u00b2 Score": -0.10245590353474165,
|
35 |
+
"MAE": 1377.3233648731018,
|
36 |
+
"RMSE": 1554.547191830137
|
37 |
+
},
|
38 |
+
{
|
39 |
+
"Model": "Gradient Boosting",
|
40 |
+
"R\u00b2 Score": -0.2384770542071457,
|
41 |
+
"MAE": 1414.7558634489615,
|
42 |
+
"RMSE": 1647.6587884958733
|
43 |
+
}
|
44 |
+
],
|
45 |
+
"shares_model_comparison": [
|
46 |
+
{
|
47 |
+
"Model": "Ridge Regression",
|
48 |
+
"R\u00b2 Score": -0.07838647409138644,
|
49 |
+
"MAE": 526.23107028139,
|
50 |
+
"RMSE": 591.928400457892
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"Model": "Lasso Regression",
|
54 |
+
"R\u00b2 Score": -0.07887883281663877,
|
55 |
+
"MAE": 526.2428483005319,
|
56 |
+
"RMSE": 592.0635133610481
|
57 |
+
},
|
58 |
+
{
|
59 |
+
"Model": "Random Forest",
|
60 |
+
"R\u00b2 Score": -0.08251842386606412,
|
61 |
+
"MAE": 518.1355614972136,
|
62 |
+
"RMSE": 593.0613338289508
|
63 |
+
},
|
64 |
+
{
|
65 |
+
"Model": "Linear Regression",
|
66 |
+
"R\u00b2 Score": -0.0852709784249106,
|
67 |
+
"MAE": 527.3942652983482,
|
68 |
+
"RMSE": 593.8148532374418
|
69 |
+
},
|
70 |
+
{
|
71 |
+
"Model": "Gradient Boosting",
|
72 |
+
"R\u00b2 Score": -0.22974105654409227,
|
73 |
+
"MAE": 529.9244912155209,
|
74 |
+
"RMSE": 632.104377749423
|
75 |
+
}
|
76 |
+
],
|
77 |
+
"best_models": {
|
78 |
+
"likes": "Random Forest",
|
79 |
+
"shares": "Ridge Regression"
|
80 |
+
}
|
81 |
+
}
|
results/virality_summary_report.txt
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
VIRALITY PREDICTION MODEL SUMMARY
|
3 |
+
================================
|
4 |
+
|
5 |
+
Dataset Overview:
|
6 |
+
- Total samples: 500
|
7 |
+
- Features used: 27
|
8 |
+
- Correlation between Likes and Shares: -0.029
|
9 |
+
|
10 |
+
Best Models:
|
11 |
+
- Likes Prediction: Random Forest (R² = -0.040)
|
12 |
+
- Shares Prediction: Ridge Regression (R² = -0.078)
|
13 |
+
|
14 |
+
Key Insights:
|
15 |
+
1. Both likes and shares show similar patterns in terms of model performance
|
16 |
+
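2. Among tree-based models, Random Forest is the best model for likes and close to the best for shares, while Gradient Boosting performs worst on both targets; Ridge Regression is the best model for shares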
|
17 |
+
3. Technical features (generation time, GPU usage) and temporal features are the main candidate predictors, although every model's R² is negative, so their predictive value is not yet demonstrated
|
18 |
+
4. Platform-specific patterns exist and should be considered for optimization
|
19 |
+
|
20 |
+
Recommendations:
|
21 |
+
1. Use separate models for likes and shares predictions
|
22 |
+
2. Consider ensemble methods for improved accuracy
|
23 |
+
3. Regular retraining with new data is recommended
|
24 |
+
4. Monitor feature importance to understand changing virality patterns
|