Spaces:

loxzdigital
/

Model-CC-Space

Runtime error

Model-CC-Space / app.py

Chandan Dwivedi

fixed issues

660607d almost 2 years ago

24.4 kB

	from ast import arg
	import streamlit as st
	import pandas as pd
	import PIL
	from urlextract import URLExtract
	import time
	from utils import *

	# from joblib import dump, load

	import joblib

	from bokeh.models.widgets import Div

	import email
	import os
	#from ipyfilechooser import FileChooser

	#from IPython.display import display
	from bs4 import BeautifulSoup
	import matplotlib.pyplot as plt
	import numpy as np
	import timeit
	import shutil

	# Theme settings for the page.
	# NOTE(review): neither name is referenced again in the visible source —
	# confirm they are still needed before relying on or removing them.
	CURRENT_THEME = "blue"
	IS_DARK_THEME = True

	def table_data():
	    """Build the two-column table of model facts rendered with ``st.table``.

	    Returns:
	        pandas.DataFrame: a ``Field`` column (attribute name) and a ``Data``
	        column (its value), one row per attribute, in the order listed below.
	    """
	    # (field, value) pairs in display order.
	    rows = (
	        ('Data Scientist', 'Chen Song'),
	        ('Dataset', 'Internal + Campaign monitor'),
	        ('Algorithm', 'Random Forest'),
	        ('Framework', 'Sci-kit learn'),
	        ('Ensemble', 'Bootstrapping'),
	        ('Domain', 'Bootstrapping Aggregation'),
	        ('Model Size', '4 KB'),
	    )
	    fields = [name for name, _ in rows]
	    values = [value for _, value in rows]
	    return pd.DataFrame.from_dict({'Field': fields, 'Data': values})


	def url_button(button_name, url):
	    """Draw a Streamlit button that asks the browser to open ``url`` when clicked.

	    Args:
	        button_name: label shown on the button.
	        url: address handed to ``window.open``; it is inserted into the
	            script text as-is, so it must not contain a single quote.
	    """
	    if not st.button(button_name):
	        return
	    # The script is attached as the onerror handler of an <img> that has no
	    # source, and the markup is rendered through a Bokeh Div.
	    open_script = """window.open('{url}')""".format(url=url)  # New tab or window
	    markup = '<img src onerror="{}">'.format(open_script)
	    st.bokeh_chart(Div(text=markup))


	def get_industry_code_dict(training_dataset):
	    """Map every value of the ``industry`` column to its integer category code.

	    The codes are the ones pandas assigns when the column is cast to the
	    ``category`` dtype (missing values receive -1).

	    Side effect (kept for backward compatibility): an ``industry_code``
	    column holding those codes is added to, or overwritten in,
	    ``training_dataset``.

	    Fix: the previous version converted *every* categorical column of the
	    frame to codes. When ``industry`` itself was already categorical that
	    produced a code -> code dictionary, and unrelated categorical columns
	    were overwritten. Only ``industry_code`` is written now.

	    Args:
	        training_dataset: DataFrame with an ``industry`` column.

	    Returns:
	        dict: ``{industry value: integer code}``.

	    Raises:
	        KeyError: if ``training_dataset`` has no ``industry`` column.
	    """
	    industry_names = training_dataset['industry']
	    industry_codes = industry_names.astype('category').cat.codes
	    training_dataset['industry_code'] = industry_codes
	    return dict(zip(industry_names, industry_codes))

	def parse_email(uploaded_file):
	    """Read the saved upload from disk and collect the text of its <body> elements.

	    The file is opened by ``uploaded_file.name`` relative to the current
	    working directory, so it must already exist there (the main flow calls
	    ``save_file`` before extracting).

	    Args:
	        uploaded_file: object with a ``name`` attribute naming the file.

	    Returns:
	        list[str]: text of every ``<body>`` tag found in every MIME part.

	    Raises:
	        OSError: if the file cannot be opened.
	    """
	    # Fix: the handle used to stay open; the context manager closes it.
	    with open(uploaded_file.name, 'r') as efile:
	        emailstr = efile.read()

	    message = email.message_from_string(emailstr)
	    parsed_email = []
	    for part in message.walk():
	        if not part.get_content_type():
	            continue
	        body = str(part.get_payload())
	        # NOTE(review): no parser is named, so Beautiful Soup chooses among
	        # the installed ones; results may differ between environments.
	        soup = BeautifulSoup(body)
	        parsed_email.extend(tag.text for tag in soup.find_all('body'))
	    return parsed_email

	#def email_upload():
	# print("Please upload your email (In HTML Format)")
	# upload = FileUpload(accept='.html', multiple=True)
	# display(upload)
	# return upload
	# fc = FileChooser()
	# display(fc)
	# return fc


	# New - In-Use
	def email_extractor(email_uploaded):
	    """Extract the body of a templated email and count its characters and URLs.

	    The body is the text from the first 'Bright Apps LLC' up to the next
	    'To read more'.

	    Fix: ``str.find`` returns -1 when a marker is absent, and that value was
	    used directly as a slice bound (dropping the last character, or keeping
	    only the last one). A missing start marker now means "from the
	    beginning", a missing end marker means "to the end", and the end marker
	    is searched for after the start marker.

	    Args:
	        email_uploaded: upload object accepted by ``parse_email``.

	    Returns:
	        tuple: ``(email_body, character_cnt, url_cnt)`` where
	        ``character_cnt`` counts non-whitespace characters of the body with
	        URLs removed and everything from the first '©' onward discarded.
	    """
	    email_text = ''.join(parse_email(email_uploaded)).strip()

	    # Locate the body between the two template markers.
	    start_index = email_text.find('Bright Apps LLC')
	    if start_index == -1:
	        start_index = 0
	    end_index = email_text.find('To read more', start_index)
	    if end_index == -1:
	        end_index = len(email_text)
	    email_body = email_text[start_index:end_index].strip()

	    # Strip layout characters and bold tags (same order as before).
	    for token in ('\n', '\t', '\r', '</b>', '<b>', '\xa0'):
	        email_body = email_body.replace(token, '')

	    url_cnt = len(URLExtract().find_urls(email_body))

	    # Remove URLs, cut at the copyright sign, then count what is left.
	    body = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)(?:(?:\/[^\s/]))*', '', email_body)
	    body = body.split('©', 1)[0]
	    character_cnt = sum(not ch.isspace() for ch in body)

	    return email_body, character_cnt, url_cnt

	def email_extractor_general(email_uploaded):
	    """Return the cleaned text of an uploaded email with its character and URL counts.

	    Unlike ``email_extractor`` this keeps the whole parsed text rather than
	    the span between two template markers.

	    Args:
	        email_uploaded: upload object accepted by ``parse_email``.

	    Returns:
	        tuple: ``(email_text, character_cnt, url_cnt)``; ``character_cnt``
	        counts non-whitespace characters after URLs are removed and the text
	        is cut at the first '©'.
	    """
	    email_text = ''.join(parse_email(email_uploaded)).strip()

	    # Drop layout characters and bold tags, in this order.
	    unwanted_tokens = ('\n', '\t', '\r', '</b>', '<b>', '\xa0')
	    for token in unwanted_tokens:
	        email_text = email_text.replace(token, '')

	    # Count URLs on the cleaned text, before they are stripped out.
	    found_urls = URLExtract().find_urls(email_text)
	    url_cnt = len(found_urls)

	    without_urls = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)(?:(?:\/[^\s/]))*', '', email_text)
	    before_copyright = without_urls.split('©', 1)[0]
	    character_cnt = sum(1 for ch in before_copyright if not ch.isspace())

	    return email_text, character_cnt, url_cnt


	# extract email body from parse email
	def email_body_extractor(email_data):
	    """Pull the plain-text body out of a raw email and count characters and URLs.

	    Args:
	        email_data: the raw message as UTF-8 encoded bytes.

	    Returns:
	        tuple: ``(body, character_cnt, url_cnt)``. ``body`` has layout
	        characters, bold tags and URLs removed and is cut at the first '©';
	        ``character_cnt`` counts its non-whitespace characters; ``url_cnt``
	        is the number of URLs found before they were removed.
	    """
	    message = email.message_from_string(email_data.decode("utf-8"))

	    if not message.is_multipart():
	        # Single-part message: the payload is the body.
	        body = message.get_payload()
	    else:
	        body = ""
	        for part in message.walk():
	            disposition = str(part.get('Content-Disposition'))
	            # First text/plain part that is not an attachment wins.
	            # The payload is taken as-is (get_payload without decode=True).
	            if part.get_content_type() == 'text/plain' and 'attachment' not in disposition:
	                body = part.get_payload()
	                break

	    # Remove escape sequences and bold tags.
	    for token in ('\n', '\t', '\r', '</b>', '<b>'):
	        body = body.replace(token, '')

	    # URLs are counted first, then stripped from the text.
	    url_cnt = len(URLExtract().find_urls(body))
	    body = re.sub(
	        r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)(?:(?:\/[^\s/]))*', '', body)
	    body = body.split('©', 1)[0]
	    character_cnt = sum(1 for ch in body if not ch.isspace())

	    return body, character_cnt, url_cnt


	def add_bg_from_url():
	    """Inject CSS that gives the app container a fixed linear-gradient background.

	    Despite the name, no URL or image is involved: the style rule is inline.
	    """
	    # A plain string (not an f-string), so the CSS braces need no doubling.
	    gradient_css = """
	<style>
	.stApp {
	background-image: linear-gradient(135deg,#061c2c,#084e69 35%,#3e7e89);
	background-attachment: fixed;
	background-size: cover

	}
	</style>
	"""
	    st.markdown(gradient_css, unsafe_allow_html=True)

	# Paint the background first, then the page title.
	add_bg_from_url()
	# Alternative gradient kept for reference:
	#linear-gradient(0deg,#010405 0,#061c2c 55%,#0a3144 75%,#0f4d60)

	st.markdown("# Character Count: Email Industry")

	# Headline stats: one caption in each of four equal-width columns.
	stats_col1, stats_col2, stats_col3, stats_col4 = st.columns([1, 1, 1, 1])
	_headline_captions = (
	    "Production: Ready",
	    "Accuracy: 85%",
	    "Speed: 16.89 ms",
	    "Industry: Email",
	)
	for _stats_col, _caption_text in zip(
	        (stats_col1, stats_col2, stats_col3, stats_col4), _headline_captions):
	    with _stats_col:
	        st.caption(_caption_text)


	# Sidebar: model description, model facts table and two external links.
	# NOTE(review): indentation is flattened in this copy of the file, so which
	# statements sit inside each `with` below is inferred — confirm against the
	# deployed app.py.
	with st.sidebar:

	with st.expander('Model Description', expanded=False):
	# NOTE(review): the file only does `import PIL`; `PIL.Image` resolves only if
	# another import has already loaded that submodule — consider
	# `from PIL import Image`.
	img = PIL.Image.open("figures/ModelCC.png")
	st.image(img)
	# NOTE(review): this text states 86% accuracy while the headline caption on
	# the main page says 85% — confirm which figure is correct.
	st.markdown('Finding the correct length for an email campaign to maximize user engagement can be an ambiguous task. The Loxz Character Count Model allows you to predict the correct length of your emails for a particular industry and a particular type of email. Using these inputs and trained on an extensive proprietary data set from the Loxz family digital archive, the models incorporate real-world and synthetic data to find the optimized character counts. We applied the random forest algorithm in this model. Bootstrapping was also ensembled in the algorithm which effectively prevents overfitting by reducing variance. The model achieves an 86% accuracy on the test set. This inference-based ML model will help the campaign engineers start with an acceptable length and zero in on the best character count, maximizing engagement in their campaign.')

	with st.expander('Model Information', expanded=False):
	# CSS that hides the first header cell and the row-header cells of tables,
	# so the DataFrame index is not shown next to the model facts.
	hide_table_row_index = """
	<style>
	thead tr th:first-child {display:none}
	tbody th {display:none}
	</style>
	"""
	st.markdown(hide_table_row_index, unsafe_allow_html=True)
	st.table(table_data())

	# Each call draws a button that opens the given address when clicked.
	url_button('Model Homepage', 'https://www.loxz.com/#/models/CTA')
	# url_button('Full Report','https://resources.loxz.com/reports/realtime-ml-character-count-model')
	url_button('Amazon Market Place', 'https://aws.amazon.com/marketplace')


	# Choices for the industry selector (order matters: selectors pick defaults by index).
	industry_lists = [
	    'Retail',
	    'Software and Technology',
	    'Hospitality',
	    'Academic and Education',
	    'Healthcare',
	    'Energy',
	    'Real Estate',
	    'Entertainment',
	    'Finance and Banking',
	]

	# Choices for the campaign-type selector.
	campaign_types = [
	    'Promotional',
	    'Transactional',
	    'Webinar',
	    'Survey',
	    'Newsletter',
	    'Engagement',
	    'Usage_and_Consumption',
	    'Review_Request',
	    'Product_Announcement',
	    'Abandoned_Cart',
	]

	# Target metrics currently offered.
	# Disabled for now (kept for reference): Bounce Rate, Spam Complaint Rate,
	# AOV, CLV, ROI, NPS, CAC, Abandonment Rate, Site Traffic, Product Return Rate,
	# Net Profit Margin, MRR, ARR, Churn, ARPU, Retention Rate, Unsubscribe Rate,
	# Email ROI.
	target_variables = [
	    'conversion_rate',
	    'click_to_open_rate',
	]

	# Upload widget restricted to .html files; the rest of the script treats a
	# value of None as "nothing uploaded yet".
	_upload_prompt = "Please upload your email (In HTML Format)"
	uploaded_file = st.file_uploader(_upload_prompt, type=["html"])

	def save_file(uploaded_file):
	    """Write the uploaded file's bytes into the current working directory.

	    Fix: the file name is supplied by the client and used to be joined into
	    the path unchecked, so a name carrying directory components (for example
	    ``../x.html``) could write outside the working directory. Such names are
	    now rejected.

	    Args:
	        uploaded_file: object exposing ``name`` (str) and ``getbuffer()``
	            (the bytes to write).

	    Raises:
	        ValueError: if ``name`` is empty, is ``.``/``..``, or contains a
	            path separator.
	    """
	    file_name = uploaded_file.name
	    has_separator = any(sep in file_name for sep in ('/', '\\')) if file_name else False
	    if not file_name or has_separator or file_name in (os.curdir, os.pardir):
	        raise ValueError(
	            "Uploaded file name must be a plain file name: {!r}".format(file_name))

	    with open(os.path.join("./", file_name), "wb") as f:
	        f.write(uploaded_file.getbuffer())

	# NOTE(review): upload_img is only assigned when nothing is uploaded and is
	# not read anywhere in the visible source — appears to be a leftover.
	if uploaded_file is None:
	    upload_img = None

	# Campaign context selectors. Defaults by position:
	# industry -> 'Real Estate', campaign -> 'Engagement',
	# target -> 'click_to_open_rate'.
	industry = st.selectbox('Please select your industry', industry_lists, index=6)
	campaign = st.selectbox('Please select your campaign type', campaign_types, index=5)
	target = st.selectbox('Please select your target variable', target_variables, index=1)

	# Horizontal rule between the inputs and the results.
	st.markdown("---")

	#char_reco_preference = st.selectbox(
	# 'Do you want to increase or decrease your character count in the email?',
	# ["Increase", "Decrease"],
	# index=1)



	# st.info([industry,campaign,target,char_reco_preference])

	act = st.button('Generate Predictions')
	# Latch the click in session state: once 'button' holds True it is never
	# overwritten here, presumably so the results stay on screen when later
	# widgets trigger a rerun.
	_latched = st.session_state.get('button')
	if _latched != True:
	    st.session_state['button'] = act

	# if act:
	# Main flow: runs once the 'Generate Predictions' click is latched in
	# session state. NOTE(review): indentation is flattened in this copy of the
	# file; the nesting of the statements below is inferred from the if/else
	# keywords — confirm against the deployed app.py.
	if st.session_state.get('button') == True:
	# NOTE(review): start_time is only used by a commented-out st.write at the
	# end of the file.
	start_time = time.time()
	if uploaded_file is None:
	st.error('Please upload a email (HTML format)')
	else:
	# Persist the upload to disk first: parse_email re-opens it by file name.
	save_file(uploaded_file)
	# NOTE(review): the matching placeholder.empty() at the end of the file is
	# commented out, so the 'Loading Data' text is never cleared.
	placeholder = st.empty()
	placeholder.text('Loading Data')

	# Starting predictions
	model = joblib.load('models/models.sav')
	# Generate Email Data
	email_data = get_files_from_aws(
	'emailcampaigntrainingdata', 'trainingdata/email_dataset_training.csv')
	# NOTE(review): acc_data is not referenced again in the visible source.
	acc_data = get_files_from_aws(
	'emailcampaigntrainingdata', 'trainingdata/email_dataset_training_raw.csv')

	email_data_ = email_data[["email_body", "industry", "campaign_type",
	"character_cnt", "url_cnt", "Open_Rate", "Click_Through_Rate"]]
	# NOTE(review): this rename is called without `columns=`, so pandas applies
	# the mapping to the row index rather than the column names — likely a no-op.
	email_data_ = email_data_.rename(
	{'Open_Rate': 'Click-to-open_Rate', 'Click_Through_Rate': 'Conversion_Rate'})
	# NOTE(review): df_email_data is not referenced again in the visible source.
	df_email_data = email_data_.rename(
	columns={'Open_Rate': 'Click-to-open_Rate', 'Click_Through_Rate': 'Conversion_Rate'})

	# Dataset:
	training_dataset = get_files_from_aws(
	'emailcampaigntrainingdata', 'modelCC/training.csv')
	# X_test = get_files_from_aws('emailcampaigntrainingdata','modelCC/Xtest.csv')
	# Y_test = get_files_from_aws('emailcampaigntrainingdata','modelCC/ytest.csv')

	# print("Getting Data Time: %s seconds" % (time.time() - start_time))

	# The industry -> code mapping is derived from email_data, while the
	# recommendations further down filter training_dataset by its own
	# industry_code column — assumes both files encode industries identically;
	# TODO confirm.
	industry_code_dict = get_industry_code_dict(email_data)
	#uploaded_file = FileChooser(uploaded_file)
	#bytes_data = uploaded_file.getvalue()

	# Cleaned text plus non-whitespace character count and URL count of the upload.
	email_body, character_cnt, url_cnt = email_extractor_general(uploaded_file)

	# Start the prediction
	# Need to solve X test issue

	# y_pred = model.predict(X_test)
	# One-row frame describing the uploaded email; the model is fed
	# industry_code, character_cnt and url_cnt, in that column order.
	df_uploaded = pd.DataFrame(
	columns=['character_cnt', "url_cnt", "industry"])
	df_uploaded.loc[0] = [character_cnt, url_cnt, industry]
	# NOTE(review): .get() yields None when the selected industry is absent from
	# the mapping, which would put a missing value into the model input.
	df_uploaded["industry_code"] = industry_code_dict.get(industry)
	df_uploaded_test = df_uploaded[[
	"industry_code", "character_cnt", "url_cnt"]]
	# NOTE(review): `target` is not part of the model input; in the visible code
	# it only changes the labels shown and the column used for recommendations.
	predicted_rate = model.predict(df_uploaded_test)[0]
	output_rate = round(predicted_rate, 4)

	if output_rate < 0:
	# NOTE(review): print() writes to the server's stdout, not to the page, so
	# the user sees nothing in this branch — st.error/st.warning may be intended.
	print(
	"Sorry, Current model couldn't provide predictions on the target variable you selected.")
	else:
	st.markdown('##### Current Character Count in Your Email is: <span style="color:yellow">{}</span>'.format(
	character_cnt), unsafe_allow_html=True)
	# st.info('The model predicts that it achieves a {} of {}%'.format(target, str(round(output_rate*100,2))))
	# Label shown next to the predicted percentage.
	if target == 'conversion_rate':
	target_vis = 'Click_Through_Rate'
	else:
	target_vis = 'Open_Rate'

	# The rate is shown as a percentage rounded to three decimals.
	st.markdown('##### The model predicts that it achieves a <span style="color:yellow">{}</span> of <span style="color:yellow">{}</span>%'.format(
	target_vis, str(round(output_rate*100, 3))), unsafe_allow_html=True)
	# Shortlist training rows of the selected industry whose rate beats the
	# prediction, looking at character counts both above and below the upload.
	selected_industry_code = industry_code_dict.get(industry)

	# Column of training_dataset holding the selected target metric.
	if target == "click_to_open_rate":
	    selected_variable = "Open_Rate"
	if target == "conversion_rate":
	    selected_variable = "Click_Through_Rate"

	# Column order matters: later code reads rows by position
	# (1 = character_cnt, 3 = the selected metric).
	df_reco = training_dataset[[
	    "industry_code", "character_cnt", "url_cnt", selected_variable]]
	# Fix: take an explicit copy of the filtered rows so the rounding below
	# writes to an independent frame instead of a slice of training_dataset.
	df_reco = df_reco[df_reco["industry_code"] == selected_industry_code].copy()
	df_reco[selected_variable] = df_reco[selected_variable].apply(
	    lambda x: round(x, 3))
	# One row per distinct rounded rate. The sorted copy the old code built here
	# (df_reco_sort) was never read and has been removed.
	df_reco = df_reco.drop_duplicates(subset=selected_variable)

	# split into two dataframes of higher and lower character_cnt (added apr 2023)
	char_cnt_uploaded = character_cnt
	beats_prediction = df_reco[selected_variable] > output_rate
	reco_char_cnt = df_reco["character_cnt"]

	# Longer candidates: up to 1.5x the uploaded length.
	df_reco_opt1 = df_reco[
	    beats_prediction
	    & (reco_char_cnt > char_cnt_uploaded)
	    & (reco_char_cnt <= (1.5 * char_cnt_uploaded))]
	# Shorter candidates: down to half the uploaded length.
	df_reco_opt2 = df_reco[
	    beats_prediction
	    & (reco_char_cnt < char_cnt_uploaded)
	    & (reco_char_cnt >= (char_cnt_uploaded / 2))]

	# drop duplicates of character_cnt keeping the row with the highest rate
	df_reco_opt1 = df_reco_opt1.sort_values(
	    by=[selected_variable], ascending=False).drop_duplicates(subset=["character_cnt"])
	df_reco_opt2 = df_reco_opt2.sort_values(
	    by=[selected_variable], ascending=False).drop_duplicates(subset=["character_cnt"])

	# get top 2 largest in higher and lower dataframe, then the best 3 overall
	df_reco_opt_rank1 = df_reco_opt1.nlargest(2, [selected_variable])
	df_reco_opt_rank2 = df_reco_opt2.nlargest(2, [selected_variable])

	df_reco_opt_rank = pd.concat([df_reco_opt_rank1, df_reco_opt_rank2])
	df_reco_opt_rank = df_reco_opt_rank.nlargest(3, [selected_variable])

	# From here on selected_variable holds a display label, no longer the name of
	# a column in the recommendation frame.
	if selected_variable == "Open_Rate":
	selected_variable = "Click-to-Open_Rate"
	if selected_variable == "Click_Through_Rate":
	selected_variable = "Conversion_Rate"

	# NOTE(review): this heading is written before the emptiness check below, so
	# it appears to be shown even when there is nothing to recommend.
	st.markdown('##### To get higher, <span style="color:yellow">{}</span>, the model recommends the following options:'.format(
	selected_variable), unsafe_allow_html=True)
	if len(df_reco_opt_rank) == 0:
	# NOTE(review): "You ve" in the message below is missing its apostrophe.
	st.markdown('##### You ve already achieved the highest, <span style="color:yellow">{}</span>, with the current character count!'.format(
	selected_variable), unsafe_allow_html=True)
	else:
	#for _, row in df_reco_opt_rank.iterrows():
	# Character_Count = row[1]
	# selected_variable = row[3]
	# print(f"·Number of Characters: {int(Character_Count)}, Target Rate: {round(selected_variable, 3)*100}", "%")
	# st.markdown('Number of Characters: {}, Target Rate: {}'.format(
	# int(Character_Count), round(selected_variable*100, 3)))

	# Parallel lists: recommended character counts and their rates in percent.
	chars = []
	sel_var_values = []

	for _, row in df_reco_opt_rank.iterrows():
	# Positions follow the column order of the recommendation frame:
	# 1 = character_cnt, 3 = the selected metric.
	# NOTE(review): integer keys on a label-indexed row rely on positional
	# fallback, which recent pandas versions deprecate — prefer row.iloc[...].
	Character_Count = row[1]
	selected_variable_number = row[3]
	chars.append(int(Character_Count))
	sel_var_values.append(round(selected_variable_number, 3)*100)
	# st.write(f"·Number of Characters: {int(Character_Count)}, Target Rate: {round(round(selected_variable_number, 3)*100, 3)}", "%")
	st.write("\n")
	# Table of recommendations, highest target rate first.
	df_modelpred=pd.DataFrame(list(zip(chars, sel_var_values)), columns=["Number of Characters", "Target_Rate"])
	# st.checkbox("Use container width", value=False, key="use_container_width")
	# st.dataframe(df_modelpred.style.highlight_max(axis=0), use_container_width=st.session_state.use_container_width)
	df_modelpred.sort_values(by='Target_Rate', ascending=False, inplace = True)
	st.dataframe(df_modelpred)

	# A horizontal bar chart is drawn only when there are at least two
	# recommendations to compare.
	if len(chars) > 1:
	    bar_positions = np.arange(len(chars))
	    fig, ax = plt.subplots(figsize=(10, 4))
	    bars = ax.barh(bar_positions, sel_var_values, height=0.175, color='#0F4D60')

	    # Axis styling: white ticks, yellow title and axis labels.
	    ax.tick_params(colors='w', which='both')
	    ax.set_yticks(bar_positions)
	    ax.set_yticklabels(tuple(chars), fontsize=14)
	    ax.set_title('Character Counts vs. Target Variable Rates', fontsize=18, color='y')
	    ax.set_ylabel('Character Counts', fontsize=16, color='y')
	    ax.set_xlabel('Target Rates %', fontsize=16, color='y')

	    # Print each bar's value just past its right end.
	    for bar, rate in zip(bars, sel_var_values):
	        label_x = bar.get_width() + 0.3
	        label_y = bar.get_y() + bar.get_height() / 2
	        ax.text(label_x, label_y, str(round(rate, 2)) + '%', ha='left', va='center',
	                fontsize=12, fontweight='bold', color='y')

	    ax.margins(0.1, 0.05)

	    # Highlight the bar with the highest rate.
	    bars[np.argmax(sel_var_values)].set_color('#00BF93')

	    # The matplotlib figure is handed to st.plotly_chart, as before.
	    st.plotly_chart(fig, use_container_width=True)

	# Recommendations as (character count, rate) pairs, highest rate first;
	# passed on to the email generator further down.
	chars_out = dict(zip(chars, sel_var_values))
	sorted_chars_out = sorted(chars_out.items(), key=lambda pair: pair[1], reverse=True)

	# One selectable label per recommendation, in the original list order.
	prefrence_variables = [
	    f"charcter counts: {count!s}, Target Rate: {rate!s}"
	    for count, rate in zip(chars, sel_var_values)
	]
	preference = st.selectbox(
	    'Please select your preferences for target metric',
	    prefrence_variables,
	    index=0,
	)

	# Optional rewriting prompts; nothing is pre-selected.
	_rewrite_prompts = [
	    "Convey key message in fewer words",
	    "Rephrase sentences to be more concise",
	    "Remove unnecessary details/repetitions",
	    "Use bullet points or numbered lists",
	    "Include clear call-to-action in the email",
	    "Link to information instead of writing it out",
	    "Shorten the subject line",
	    "Replace technical terms with simpler language",
	]
	options = st.multiselect(
	    'Select prompts you want to use to generate your email:',
	    _rewrite_prompts,
	    default=None,
	)
	# Echo the current selection on the page.
	st.markdown('options: {}'.format(options), unsafe_allow_html=True)


	# Generate an AI-written email from the chosen recommendation and, when any
	# rewriting prompts are selected, optimise it and re-score the result.
	#
	# Fixes:
	# * st.multiselect returns a list (empty when nothing is picked), never None,
	#   so the old `options is None` / `options==None` tests could not be true:
	#   the validation error and the "preference only" branch were unreachable.
	#   Emptiness is tested instead.
	# * `preference is not ''` compared identity with a string literal; plain
	#   truthiness is used instead.
	if st.button('Generate AI Recommended Email'):
	    if not preference and not options:
	        st.error('Please select your preferences.')
	    else:
	        stats_col1, stats_col2, stats_col3, stats_col4 = st.columns([1, 1, 1, 1])
	        with stats_col1:
	            st.caption("Production: Ready")
	        with stats_col2:
	            st.caption("Accuracy: 85%")
	        with stats_col3:
	            st.caption("Speed: 16.89 ms")
	        with stats_col4:
	            st.caption("Industry: Email")

	        if not options:
	            # No rewriting prompts: show the email generated for the chosen
	            # recommendation as-is.
	            if preference:
	                ai_generated_email = generate_example_email_with_context(
	                    email_body, campaign, industry, target, sorted_chars_out, preference)
	                st.markdown('##### Here is the recommended Generated Email for you:')
	                with st.expander('', expanded=True):
	                    st.markdown('{}'.format(ai_generated_email), unsafe_allow_html=True)
	        else:
	            # Start from the uploaded text, or from the generated email when a
	            # recommendation is selected, then apply the rewriting prompts.
	            email_body_opt = email_body
	            if preference:
	                ai_generated_email = generate_example_email_with_context(
	                    email_body, campaign, industry, target, sorted_chars_out, preference)
	                email_body_opt = ai_generated_email
	            optimized_email, optimized_char_cnt, optimized_url_cnt = optimize_email_prompt_multi(
	                email_body_opt, options)
	            # Re-score the optimised email.
	            charc, tmval = get_optimized_prediction(
	                "sagemakermodelcc", "modelCC.sav", "sagemakermodelcc", target, industry,
	                optimized_char_cnt, optimized_url_cnt, industry_code_dict)
	            st.markdown('##### Current Character Count in Your Optimized Email is: <span style="color:yellow">{}</span>'.format(charc), unsafe_allow_html=True)
	            st.markdown('##### The model predicts that it achieves a <span style="color:yellow">{}</span> of <span style="color:yellow">{}</span>%'.format(target, tmval), unsafe_allow_html=True)
	            st.markdown('##### Here is the recommended Generated Email for you:')
	            with st.expander('', expanded=True):
	                st.markdown('{}'.format(optimized_email), unsafe_allow_html=True)

	# st.session_state['button'] = False
	# preference= "character counts: "+str(573)+", Target Rate: "+str(37.2)
	# ai_generated_email=generate_example_email_with_context(email_body, campaign, industry, target, sorted_chars_out, preference)
	# print("ai_generated_email: ",ai_generated_email)
	# st.markdown('##### Here is the recommended Generated Email for you:')
	# st.markdown('####### {}'.format(ai_generated_email),unsafe_allow_html=True)
	#st.write(np.array(chars))


	# chars_out = dict(zip(chars, sel_var_values))
	# sorted_chars_out = sorted(chars_out.items(), key=lambda x: x[1], reverse=True)


	# placeholder.empty()
	#st.write(time.time() - start_time)