File size: 5,498 Bytes
eb1f440
 
 
 
 
93ba032
eb1f440
 
 
 
 
 
9ce0485
 
 
 
4b388b8
ed0c441
616896b
ed0c441
eb1f440
 
 
 
 
c1478a9
 
93ba032
c1478a9
 
 
93ba032
c1478a9
 
 
 
 
93ba032
c1478a9
 
93ba032
c1478a9
 
 
 
 
93ba032
c1478a9
 
 
 
93ba032
c1478a9
 
93ba032
c1478a9
 
 
 
93ba032
c1478a9
 
eb1f440
c1478a9
 
 
eb1f440
c1478a9
 
5c2fff1
 
 
 
 
 
c1478a9
5c2fff1
93ba032
 
 
 
 
 
42b052d
93ba032
 
 
 
 
 
5c2fff1
5f22042
b004973
4b388b8
5c2fff1
7b32eb4
 
 
 
4b388b8
93ba032
 
 
 
eb1f440
5c2fff1
 
b21d5e0
7415061
848851d
5c2fff1
 
 
b21d5e0
4b388b8
eb1f440
5c2fff1
92d7932
 
 
b21d5e0
eb1f440
 
 
 
5c2fff1
 
8f299bb
eb1f440
 
 
8f299bb
eb1f440
8f299bb
eb1f440
8f299bb
eb1f440
 
 
 
4b51a5a
eb1f440
52855ba
 
5c2fff1
 
 
52855ba
cb33fad
 
f5761ae
cb33fad
58fbefd
 
eb1f440
 
 
 
 
 
 
 
 
 
 
58fbefd
eb1f440
 
58fbefd
680cff7
eb1f440
0a00489
4b51a5a
eb1f440
680cff7
0e8c5c1
9713997
0b93644
df169d5
32aefbe
8e2b9e5
220313a
b7d5afb
e2e4921
cb33fad
f5761ae
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
### ----------------------------- ###
###           libraries           ###
### ----------------------------- ###

import streamlit as st
import pickle as pkl
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics


### ----------------------------- ###
###        interface setup        ###
### ----------------------------- ###

# inject the project stylesheet into the page; unsafe_allow_html is required
# so Streamlit renders the raw <style> tag instead of escaping it as text
with open('styles.css') as f:
  st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)


### ------------------------------ ###
###       data transformation      ###
### ------------------------------ ###

# load dataset
uncleaned_data = pd.read_csv('data.csv')

# remove timestamp from dataset (always first column)
uncleaned_data = uncleaned_data.iloc[:, 1:]
data = pd.DataFrame()

# keep track of which columns are categorical and what
# those columns' value mappings are
# structure: {colname1: {...}, colname2: {...} }
cat_value_dicts = {}
final_colname = uncleaned_data.columns[-1]

# for each column...
# NOTE: DataFrame.iteritems() was removed in pandas 2.0; items() is the
# drop-in replacement and behaves identically.
for (colname, colval) in uncleaned_data.items():

  # check if col is already a number; if so, add col directly
  # to new dataframe and skip to next column
  if isinstance(colval.values[0], (np.integer, float)):
    data[colname] = uncleaned_data[colname].copy()
    continue

  # structure: {0: "lilac", 1: "blue", ...}
  new_dict = {}
  val = 0  # next unused numeric code for this column
  transformed_col_vals = []  # new numeric datapoints

  # if not, for each item in that column...
  for item in colval.values:

    # first time this category appears: assign it the next numeric code
    if item not in new_dict:
      new_dict[item] = val
      val += 1

    # then add numerical value to transformed dataframe
    transformed_col_vals.append(new_dict[item])

  # reverse dictionary only for final col (0, 1) => (vals), so numeric
  # predictions can be mapped back to their human-readable labels
  if colname == final_colname:
    new_dict = {value: key for (key, value) in new_dict.items()}

  cat_value_dicts[colname] = new_dict
  data[colname] = transformed_col_vals


### -------------------------------- ###
###           model training         ###
### -------------------------------- ###

def train_model(dataset=None, test_size=0.25):
  """Train a logistic-regression model and persist it to disk.

  The last column of the dataset is used as the prediction target and
  every other column as a feature. The fitted model is pickled to
  'model.pkl' and its held-out accuracy is written to 'acc.txt'.

  Args:
    dataset: optional DataFrame to train on; defaults to the module-level
      `data` prepared above (keeps the original no-argument call working).
    test_size: fraction of rows held out for the accuracy estimate.

  Returns:
    The fitted LogisticRegression model.
  """
  df = data if dataset is None else dataset

  # select features and prediction; automatically selects last column as prediction
  num_features = len(df.columns) - 1
  x = df.iloc[:, :num_features]
  y = df.iloc[:, num_features:]

  # split data into training and testing sets
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size)

  # instantiate the model (using default parameters)
  model = LogisticRegression()
  model.fit(x_train, y_train.values.ravel())  # ravel: sklearn wants a 1-D target
  y_pred = model.predict(x_test)

  # save the model to file using the pickle package
  with open('model.pkl', 'wb') as f:
    pkl.dump(model, f)

  # save model accuracy to file ('w' suffices: the '+' read access was unused)
  with open('acc.txt', 'w') as f:
    acc = metrics.accuracy_score(y_test, y_pred)
    f.write(str(round(acc * 100, 1)) + '%')

  return model

### -------------------------------- ###
###            rerun logic           ###
### -------------------------------- ###

# check to see if this is the first time running the script:
# if the model has already been trained and saved, load it
try:
  with open('model.pkl', 'rb') as f:
    model = pkl.load(f)

# if this is the first time running the script, train the model
# and save it to the file model.pkl (training also writes acc.txt)
except FileNotFoundError:
  model = train_model()

# read the model accuracy from file; if acc.txt is missing (e.g. it was
# deleted while model.pkl survived), retrain so both files are regenerated
# instead of crashing with an uncaught FileNotFoundError
try:
  with open('acc.txt', 'r') as f:
    acc = f.read()
except FileNotFoundError:
  model = train_model()
  with open('acc.txt', 'r') as f:
    acc = f.read()


### ------------------------------- ###
###        interface creation       ###
### ------------------------------- ###

# uses the logistic regression to predict for a generic number
# of features
def general_predictor(input_list):
  """Predict a human-readable label for one datapoint.

  Args:
    input_list: raw user inputs, one per feature column, in column order.
      Categorical answers are the original string values; numeric answers
      pass through unchanged.

  Returns:
    The label for the model's prediction, looked up through the reversed
    mapping stored for the final (target) column.
  """
  features = []

  # transform categorical input to its numeric code
  # ('value' instead of 'input' to avoid shadowing the builtin)
  for colname, value in zip(data.columns, input_list):
    if colname in cat_value_dicts:
      features.append(cat_value_dicts[colname][value])
    else:
      features.append(value)

  # predict single datapoint (the model expects a 2-D array-like)
  result = model.predict([features])
  return cat_value_dicts[final_colname][result[0]]

def get_feat():
  """Return the name of the most influential feature.

  Influence is the absolute value of the model's logistic-regression
  coefficient; ties resolve to the earliest column, matching the
  first-occurrence behavior of list.index().
  """
  coefficients = model.coef_[0]
  strongest, _ = max(enumerate(coefficients), key=lambda pair: abs(pair[1]))
  return data.columns[strongest]

# page header: the first line of info.md is the quiz title
with open('info.md') as f:
  st.title(f.readline())
  st.subheader('Take the quiz to get a personalized recommendation using AI.')

form = st.form('ml-inputs')

# add data labels to replace those lost via star-args
inputls = []
for colname in data.columns:
  # skip last column (the prediction target, not a question)
  if colname == final_colname:
    continue

  # access categories dict if data is categorical
  # otherwise, just use a number input
  if colname in cat_value_dicts:
    radio_options = list(cat_value_dicts[colname].keys())
    inputls.append(form.selectbox(colname, radio_options))
  else:
    # add numerical input (was misspelled 'number_imput', which raised
    # AttributeError for any dataset with a numeric feature column)
    inputls.append(form.number_input(colname))

# run the prediction once the Streamlit form is submitted
if form.form_submit_button("Submit to get your recommendation!"):
  prediction = general_predictor(inputls)

  form.subheader(prediction)

col1, col2 = st.columns(2)
col1.metric("Number of Different Possible Results", len(cat_value_dicts[final_colname]))
col2.metric("Model Accuracy",  acc)
st.metric("Most Important Question", "")
st.subheader(get_feat())
st.markdown("***")

# page footer: render the body of info.md (everything after the title line)
with open('info.md') as f:
  f.readline()  # skip the title, already rendered above
  st.markdown(f.read())