Andy Lau committed on
Commit
41e5416
·
1 Parent(s): 8170b82
Files changed (1) hide show
  1. app.py +94 -75
app.py CHANGED
@@ -39,8 +39,8 @@ def table_data():
39
  ]
40
 
41
  data = {
42
- 'Field':field,
43
- 'Data':data
44
  }
45
 
46
  df = pd.DataFrame.from_dict(data)
@@ -48,26 +48,29 @@ def table_data():
48
  return df
49
 
50
 
51
-
52
- def url_button(button_name,url):
53
  if st.button(button_name):
54
- js = """window.open('{url}')""".format(url=url) # New tab or window
55
  html = '<img src onerror="{}">'.format(js)
56
  div = Div(text=html)
57
  st.bokeh_chart(div)
58
 
 
59
  def get_industry_code_dict(training_dataset):
60
- training_dataset['industry_code'] = training_dataset['industry'].astype('category')
 
61
  cat_columns = training_dataset.select_dtypes(['category']).columns
62
- training_dataset[cat_columns] = training_dataset[cat_columns].apply(lambda x: x.cat.codes)
63
- industry_code_dict = dict(zip(training_dataset.industry, training_dataset.industry_code))
 
 
64
  return industry_code_dict
65
 
66
 
67
- ## extract email body from parse email
68
- def email_body_extractor(email_data):
69
  # email_data = parsed_email.data[0]
70
- emailstr = email_data.decode("utf-8")
71
  b = email.message_from_string(emailstr)
72
  body = ""
73
 
@@ -83,20 +86,20 @@ def email_body_extractor(email_data):
83
  # not multipart - i.e. plain text, no attachments, keeping fingers crossed
84
  else:
85
  body = b.get_payload()
86
- ## Remove escape sequences
87
  body = body.replace('\n', '')
88
  body = body.replace('\t', '')
89
  body = body.replace('\r', '')
90
  body = body.replace('</b>', '')
91
  body = body.replace('<b>', '')
92
 
93
-
94
- ## Extract urls in the email body and get url counts
95
  extractor = URLExtract()
96
  urls = extractor.find_urls(body)
97
  url_cnt = len(urls)
98
- ## Remove urls
99
- body = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', body)
 
100
  sep = '©'
101
  body = body.split(sep, 1)[0]
102
  character_cnt = sum(not chr.isspace() for chr in body)
@@ -106,7 +109,7 @@ def email_body_extractor(email_data):
106
 
107
  def add_bg_from_url():
108
  st.markdown(
109
- f"""
110
  <style>
111
  .stApp {{
112
  background-image: linear-gradient(#0A3144,#126072,#1C8D99);
@@ -115,10 +118,11 @@ def add_bg_from_url():
115
  }}
116
  </style>
117
  """,
118
- unsafe_allow_html=True
119
- )
 
 
120
 
121
- # add_bg_from_url()
122
 
123
  st.markdown("# Character Count: Email Industry")
124
 
@@ -128,7 +132,7 @@ st.markdown("# Character Count: Email Industry")
128
  # img = PIL.Image.open("figures/ModelCC_solid.png")
129
  # st.image(img)
130
 
131
- stats_col1, stats_col2, stats_col3, stats_col4 = st.columns([1,1,1,1])
132
 
133
  with stats_col1:
134
  st.metric(label="Production", value="Production")
@@ -142,7 +146,6 @@ with stats_col4:
142
  st.metric(label="Industry", value="Email")
143
 
144
 
145
-
146
  with st.sidebar:
147
 
148
  with st.expander('Model Description', expanded=False):
@@ -160,9 +163,9 @@ with st.sidebar:
160
  st.markdown(hide_table_row_index, unsafe_allow_html=True)
161
  st.table(table_data())
162
 
163
- url_button('Model Homepage','https://www.loxz.com/#/models/CTA')
164
  # url_button('Full Report','https://resources.loxz.com/reports/realtime-ml-character-count-model')
165
- url_button('Amazon Market Place','https://aws.amazon.com/marketplace')
166
 
167
 
168
  industry_lists = [
@@ -178,24 +181,25 @@ industry_lists = [
178
  ]
179
 
180
  campaign_types = [
181
- 'Promotional',
182
- 'Transactional',
183
- 'Webinar',
184
- 'Survey',
185
- 'Newsletter',
186
  'Engagement',
187
- 'Curated_Content',
188
- 'Review_Request',
189
- 'Product_Announcement',
190
  'Abandoned_Cart'
191
  ]
192
 
193
  target_variables = [
194
- 'conversion_rate',
195
  'click_to_open_rate'
196
  ]
197
 
198
- uploaded_file = st.file_uploader("Please upload your email (In HTML Format)", type=["html"])
 
199
 
200
  if uploaded_file is None:
201
  # upload_img = PIL.Image.open(uploaded_file)
@@ -210,8 +214,8 @@ industry = st.selectbox(
210
  index=6
211
  )
212
 
213
- campaign = st.selectbox(
214
- 'Please select your industry',
215
  campaign_types,
216
  index=5
217
  )
@@ -230,24 +234,24 @@ char_reco_preference = st.selectbox(
230
  index=1)
231
 
232
 
233
- def get_files_from_aws(bucket,prefix):
234
  """
235
  get files from aws s3 bucket
236
-
237
  bucket (STRING): bucket name
238
  prefix (STRING): file location in s3 bucket
239
  """
240
  s3_client = boto3.client('s3',
241
- aws_access_key_id = st.secrets["aws_id"],
242
- aws_secret_access_key = st.secrets["aws_key"])
243
 
244
- file_obj = s3_client.get_object(Bucket=bucket,Key=prefix)
245
  body = file_obj['Body']
246
  string = body.read().decode('utf-8')
247
-
248
  df = pd.read_csv(StringIO(string))
249
-
250
- return df
251
 
252
 
253
  # st.info([industry,campaign,target,char_reco_preference])
@@ -264,20 +268,26 @@ if st.button('Generate Predictions'):
264
  # Starting predictions
265
  model = joblib.load('models/models.sav')
266
  # Generate Email Data
267
- email_data = get_files_from_aws('emailcampaigntrainingdata','trainingdata/email_dataset_training.csv')
268
- acc_data = get_files_from_aws('emailcampaigntrainingdata','trainingdata/email_dataset_training_raw.csv')
269
-
270
- email_data_ = email_data[["email_body", "industry", "campaign_type","character_cnt", "url_cnt","Open_Rate", "Click_Through_Rate"]]
271
- email_data_ = email_data_.rename({'Open_Rate': 'Click-to-open_Rate', 'Click_Through_Rate': 'Conversion_Rate'})
272
- df_email_data = email_data_.rename(columns={'Open_Rate': 'Click-to-open_Rate', 'Click_Through_Rate': 'Conversion_Rate'})
 
 
 
 
 
273
 
274
  # Dataset:
275
- training_dataset = get_files_from_aws('emailcampaigntrainingdata','modelCC/training.csv')
 
276
  # X_test = get_files_from_aws('emailcampaigntrainingdata','modelCC/Xtest.csv')
277
  # Y_test = get_files_from_aws('emailcampaigntrainingdata','modelCC/ytest.csv')
278
 
279
  # print("Getting Data Time: %s seconds" % (time.time() - start_time))
280
-
281
  industry_code_dict = get_industry_code_dict(email_data)
282
  bytes_data = uploaded_file.getvalue()
283
 
@@ -287,24 +297,28 @@ if st.button('Generate Predictions'):
287
  # Need to solve X test issue
288
 
289
  # y_pred = model.predict(X_test)
290
- df_uploaded = pd.DataFrame(columns=['character_cnt', "url_cnt", "industry"])
 
291
  df_uploaded.loc[0] = [character_cnt, url_cnt, industry]
292
  df_uploaded["industry_code"] = industry_code_dict.get(industry)
293
- df_uploaded_test = df_uploaded[["industry_code", "character_cnt", "url_cnt"]]
294
- predicted_rate = model.predict(df_uploaded_test)[0]
295
- output_rate = round(predicted_rate,4)
296
-
297
 
298
  if output_rate < 0:
299
- print("Sorry, Current model couldn't provide predictions on the target variable you selected.")
 
300
  else:
301
- st.markdown('#### Current Character Count in Your Email is: <span style="color:blue">{}</span>'.format(character_cnt), unsafe_allow_html=True)
 
302
  # st.info('The model predicts that it achieves a {} of {}%'.format(target, str(round(output_rate*100,2))))
303
  if target == 'conversion_rate':
304
  target_vis = 'Conversion Rate'
305
  else:
306
  target_vis = 'Click-to-Open Rate'
307
- st.markdown('#### The model predicts that it achieves a <span style="color:blue">{}</span> of <span style="color:blue">{}</span>%'.format(target_vis, str(round(output_rate*100,2))),unsafe_allow_html=True)
 
308
  selected_industry_code = industry_code_dict.get(industry)
309
 
310
  if target == "click_to_open_rate":
@@ -312,38 +326,43 @@ if st.button('Generate Predictions'):
312
  if target == "conversion_rate":
313
  selected_variable = "Click_Through_Rate"
314
 
315
-
316
- df_reco = training_dataset[["industry_code", "character_cnt", "url_cnt", selected_variable]]
317
- df_reco = df_reco[df_reco["industry_code"] == selected_industry_code]
318
- df_reco[selected_variable]=df_reco[selected_variable].apply(lambda x:round(x, 3))
 
 
319
  df_reco_sort = df_reco.sort_values(by=[selected_variable])
320
  df_reco = df_reco.drop_duplicates(subset=selected_variable)
321
 
322
  preference = char_reco_preference
323
  if preference == "Increase":
324
- df_reco_opt = df_reco[(df_reco[selected_variable] > output_rate) & (df_reco["character_cnt"] > character_cnt) & (df_reco["character_cnt"] <= (1.5*character_cnt))]
325
- df_reco_opt_rank = df_reco_opt.nlargest(3,[selected_variable])
326
- ## decrease character reco
 
327
  if preference == "Decrease":
328
- df_reco_opt = df_reco[(df_reco[selected_variable] > output_rate) & (df_reco["character_cnt"] < character_cnt)]
329
- df_reco_opt_rank = df_reco_opt.nlargest(3,[selected_variable])
330
-
331
 
332
  if selected_variable == "Open_Rate":
333
  selected_variable = "Click-to-Open_Rate"
334
  if selected_variable == "Click_Through_Rate":
335
  selected_variable = "Conversion_Rate"
336
 
337
- st.markdown('#### To get higher, <span style="color:blue">{}</span>, the model recommends the following options:'.format(selected_variable),unsafe_allow_html=True)
 
338
  if len(df_reco_opt_rank) == 0:
339
- st.markdown('#### You ve already achieved the highest, <span style="color:blue">{}</span>, with the current character count!'.format(selected_variable),unsafe_allow_html=True)
 
340
  else:
341
  for _, row in df_reco_opt_rank.iterrows():
342
  Character_Count = row[1]
343
  selected_variable = row[3]
344
  # print(f"·Number of Characters: {int(Character_Count)}, Target Rate: {round(selected_variable, 3)*100}", "%")
345
- st.markdown('Number of Characters: {}, Target Rate: {}'.format(int(Character_Count), round(selected_variable*100, 3)))
346
-
347
 
348
  placeholder.empty()
349
- # print(time.time() - start_time)
 
39
  ]
40
 
41
  data = {
42
+ 'Field': field,
43
+ 'Data': data
44
  }
45
 
46
  df = pd.DataFrame.from_dict(data)
 
48
  return df
49
 
50
 
51
+ def url_button(button_name, url):
 
52
  if st.button(button_name):
53
+ js = """window.open('{url}')""".format(url=url) # New tab or window
54
  html = '<img src onerror="{}">'.format(js)
55
  div = Div(text=html)
56
  st.bokeh_chart(div)
57
 
58
+
59
  def get_industry_code_dict(training_dataset):
60
+ training_dataset['industry_code'] = training_dataset['industry'].astype(
61
+ 'category')
62
  cat_columns = training_dataset.select_dtypes(['category']).columns
63
+ training_dataset[cat_columns] = training_dataset[cat_columns].apply(
64
+ lambda x: x.cat.codes)
65
+ industry_code_dict = dict(
66
+ zip(training_dataset.industry, training_dataset.industry_code))
67
  return industry_code_dict
68
 
69
 
70
+ # extract email body from parse email
71
+ def email_body_extractor(email_data):
72
  # email_data = parsed_email.data[0]
73
+ emailstr = email_data.decode("utf-8")
74
  b = email.message_from_string(emailstr)
75
  body = ""
76
 
 
86
  # not multipart - i.e. plain text, no attachments, keeping fingers crossed
87
  else:
88
  body = b.get_payload()
89
+ # Remove escape sequences
90
  body = body.replace('\n', '')
91
  body = body.replace('\t', '')
92
  body = body.replace('\r', '')
93
  body = body.replace('</b>', '')
94
  body = body.replace('<b>', '')
95
 
96
+ # Extract urls in the email body and get url counts
 
97
  extractor = URLExtract()
98
  urls = extractor.find_urls(body)
99
  url_cnt = len(urls)
100
+ # Remove urls
101
+ body = re.sub(
102
+ r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', body)
103
  sep = '©'
104
  body = body.split(sep, 1)[0]
105
  character_cnt = sum(not chr.isspace() for chr in body)
 
109
 
110
  def add_bg_from_url():
111
  st.markdown(
112
+ f"""
113
  <style>
114
  .stApp {{
115
  background-image: linear-gradient(#0A3144,#126072,#1C8D99);
 
118
  }}
119
  </style>
120
  """,
121
+ unsafe_allow_html=True
122
+ )
123
+
124
+ # add_bg_from_url()
125
 
 
126
 
127
  st.markdown("# Character Count: Email Industry")
128
 
 
132
  # img = PIL.Image.open("figures/ModelCC_solid.png")
133
  # st.image(img)
134
 
135
+ stats_col1, stats_col2, stats_col3, stats_col4 = st.columns([1, 1, 1, 1])
136
 
137
  with stats_col1:
138
  st.metric(label="Production", value="Production")
 
146
  st.metric(label="Industry", value="Email")
147
 
148
 
 
149
  with st.sidebar:
150
 
151
  with st.expander('Model Description', expanded=False):
 
163
  st.markdown(hide_table_row_index, unsafe_allow_html=True)
164
  st.table(table_data())
165
 
166
+ url_button('Model Homepage', 'https://www.loxz.com/#/models/CTA')
167
  # url_button('Full Report','https://resources.loxz.com/reports/realtime-ml-character-count-model')
168
+ url_button('Amazon Market Place', 'https://aws.amazon.com/marketplace')
169
 
170
 
171
  industry_lists = [
 
181
  ]
182
 
183
  campaign_types = [
184
+ 'Promotional',
185
+ 'Transactional',
186
+ 'Webinar',
187
+ 'Survey',
188
+ 'Newsletter',
189
  'Engagement',
190
+ 'Curated_Content',
191
+ 'Review_Request',
192
+ 'Product_Announcement',
193
  'Abandoned_Cart'
194
  ]
195
 
196
  target_variables = [
197
+ 'conversion_rate',
198
  'click_to_open_rate'
199
  ]
200
 
201
+ uploaded_file = st.file_uploader(
202
+ "Please upload your email (In HTML Format)", type=["html"])
203
 
204
  if uploaded_file is None:
205
  # upload_img = PIL.Image.open(uploaded_file)
 
214
  index=6
215
  )
216
 
217
+ campaign = st.selectbox(
218
+ 'Please select your campaign type',
219
  campaign_types,
220
  index=5
221
  )
 
234
  index=1)
235
 
236
 
237
+ def get_files_from_aws(bucket, prefix):
238
  """
239
  get files from aws s3 bucket
240
+
241
  bucket (STRING): bucket name
242
  prefix (STRING): file location in s3 bucket
243
  """
244
  s3_client = boto3.client('s3',
245
+ aws_access_key_id=st.secrets["aws_id"],
246
+ aws_secret_access_key=st.secrets["aws_key"])
247
 
248
+ file_obj = s3_client.get_object(Bucket=bucket, Key=prefix)
249
  body = file_obj['Body']
250
  string = body.read().decode('utf-8')
251
+
252
  df = pd.read_csv(StringIO(string))
253
+
254
+ return df
255
 
256
 
257
  # st.info([industry,campaign,target,char_reco_preference])
 
268
  # Starting predictions
269
  model = joblib.load('models/models.sav')
270
  # Generate Email Data
271
+ email_data = get_files_from_aws(
272
+ 'emailcampaigntrainingdata', 'trainingdata/email_dataset_training.csv')
273
+ acc_data = get_files_from_aws(
274
+ 'emailcampaigntrainingdata', 'trainingdata/email_dataset_training_raw.csv')
275
+
276
+ email_data_ = email_data[["email_body", "industry", "campaign_type",
277
+ "character_cnt", "url_cnt", "Open_Rate", "Click_Through_Rate"]]
278
+ email_data_ = email_data_.rename(
279
+ {'Open_Rate': 'Click-to-open_Rate', 'Click_Through_Rate': 'Conversion_Rate'})
280
+ df_email_data = email_data_.rename(
281
+ columns={'Open_Rate': 'Click-to-open_Rate', 'Click_Through_Rate': 'Conversion_Rate'})
282
 
283
  # Dataset:
284
+ training_dataset = get_files_from_aws(
285
+ 'emailcampaigntrainingdata', 'modelCC/training.csv')
286
  # X_test = get_files_from_aws('emailcampaigntrainingdata','modelCC/Xtest.csv')
287
  # Y_test = get_files_from_aws('emailcampaigntrainingdata','modelCC/ytest.csv')
288
 
289
  # print("Getting Data Time: %s seconds" % (time.time() - start_time))
290
+
291
  industry_code_dict = get_industry_code_dict(email_data)
292
  bytes_data = uploaded_file.getvalue()
293
 
 
297
  # Need to solve X test issue
298
 
299
  # y_pred = model.predict(X_test)
300
+ df_uploaded = pd.DataFrame(
301
+ columns=['character_cnt', "url_cnt", "industry"])
302
  df_uploaded.loc[0] = [character_cnt, url_cnt, industry]
303
  df_uploaded["industry_code"] = industry_code_dict.get(industry)
304
+ df_uploaded_test = df_uploaded[[
305
+ "industry_code", "character_cnt", "url_cnt"]]
306
+ predicted_rate = model.predict(df_uploaded_test)[0]
307
+ output_rate = round(predicted_rate, 4)
308
 
309
  if output_rate < 0:
310
+ print(
311
+ "Sorry, Current model couldn't provide predictions on the target variable you selected.")
312
  else:
313
+ st.markdown('#### Current Character Count in Your Email is: <span style="color:blue">{}</span>'.format(
314
+ character_cnt), unsafe_allow_html=True)
315
  # st.info('The model predicts that it achieves a {} of {}%'.format(target, str(round(output_rate*100,2))))
316
  if target == 'conversion_rate':
317
  target_vis = 'Conversion Rate'
318
  else:
319
  target_vis = 'Click-to-Open Rate'
320
+ st.markdown('#### The model predicts that it achieves a <span style="color:blue">{}</span> of <span style="color:blue">{}</span>%'.format(
321
+ target_vis, str(round(output_rate*100, 2))), unsafe_allow_html=True)
322
  selected_industry_code = industry_code_dict.get(industry)
323
 
324
  if target == "click_to_open_rate":
 
326
  if target == "conversion_rate":
327
  selected_variable = "Click_Through_Rate"
328
 
329
+ df_reco = training_dataset[[
330
+ "industry_code", "character_cnt", "url_cnt", selected_variable]]
331
+ df_reco = df_reco[df_reco["industry_code"]
332
+ == selected_industry_code]
333
+ df_reco[selected_variable] = df_reco[selected_variable].apply(
334
+ lambda x: round(x, 3))
335
  df_reco_sort = df_reco.sort_values(by=[selected_variable])
336
  df_reco = df_reco.drop_duplicates(subset=selected_variable)
337
 
338
  preference = char_reco_preference
339
  if preference == "Increase":
340
+ df_reco_opt = df_reco[(df_reco[selected_variable] > output_rate) & (
341
+ df_reco["character_cnt"] > character_cnt) & (df_reco["character_cnt"] <= (1.5*character_cnt))]
342
+ df_reco_opt_rank = df_reco_opt.nlargest(3, [selected_variable])
343
+ # decrease character reco
344
  if preference == "Decrease":
345
+ df_reco_opt = df_reco[(df_reco[selected_variable] > output_rate) & (
346
+ df_reco["character_cnt"] < character_cnt)]
347
+ df_reco_opt_rank = df_reco_opt.nlargest(3, [selected_variable])
348
 
349
  if selected_variable == "Open_Rate":
350
  selected_variable = "Click-to-Open_Rate"
351
  if selected_variable == "Click_Through_Rate":
352
  selected_variable = "Conversion_Rate"
353
 
354
+ st.markdown('#### To get higher, <span style="color:blue">{}</span>, the model recommends the following options:'.format(
355
+ selected_variable), unsafe_allow_html=True)
356
  if len(df_reco_opt_rank) == 0:
357
+ st.markdown('#### You ve already achieved the highest, <span style="color:blue">{}</span>, with the current character count!'.format(
358
+ selected_variable), unsafe_allow_html=True)
359
  else:
360
  for _, row in df_reco_opt_rank.iterrows():
361
  Character_Count = row[1]
362
  selected_variable = row[3]
363
  # print(f"·Number of Characters: {int(Character_Count)}, Target Rate: {round(selected_variable, 3)*100}", "%")
364
+ st.markdown('Number of Characters: {}, Target Rate: {}'.format(
365
+ int(Character_Count), round(selected_variable*100, 3)))
366
 
367
  placeholder.empty()
368
+ # print(time.time() - start_time)