Isabel Gwara committed on
Commit
eb1f440
·
1 Parent(s): df9d9a9

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +141 -0
app.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### ----------------------------- ###
2
+ ### libraries ###
3
+ ### ----------------------------- ###
4
+
5
+ import streamlit as st
6
+ import pandas as pd
7
+ import numpy as np
8
+ from sklearn.model_selection import train_test_split
9
+ from sklearn.linear_model import LogisticRegression
10
+ from sklearn import metrics
11
+
12
+
13
+ ### ------------------------------ ###
14
+ ### data transformation ###
15
+ ### ------------------------------ ###
16
+
17
# load dataset
uncleaned_data = pd.read_csv('data.csv')

# remove timestamp from dataset (always first column)
uncleaned_data = uncleaned_data.iloc[:, 1:]
data = pd.DataFrame()

# keep track of which columns are categorical and what
# those columns' value mappings are
# structure: {colname1: {...}, colname2: {...} }
cat_value_dicts = {}
final_colname = uncleaned_data.columns[-1]

# for each column...
# NOTE: DataFrame.iteritems() was removed in pandas 2.0; items() yields the
# same (column name, Series) pairs and exists in older releases as well.
for colname, colval in uncleaned_data.items():

    # check if col is already a number; if so, add col directly
    # to new dataframe and skip to next column
    if isinstance(colval.values[0], (np.integer, float)):
        data[colname] = uncleaned_data[colname].copy()
        continue

    # structure while building: {"lilac": 0, "blue": 1, ...}
    new_dict = {}
    transformed_col_vals = []  # new numeric datapoints

    # if not, for each item in that column...
    for item in colval.values:
        # first time a value is seen it gets the next free integer code
        # (the current dict size); repeats reuse their existing code
        code = new_dict.setdefault(item, len(new_dict))

        # then add numerical value to transformed dataframe
        transformed_col_vals.append(code)

    # reverse dictionary only for final col (0, 1) => (vals), so that a
    # predicted code can be mapped back to its original label
    if colname == final_colname:
        new_dict = {value: key for (key, value) in new_dict.items()}

    cat_value_dicts[colname] = new_dict
    data[colname] = transformed_col_vals
61
+
62
+
63
+ ### -------------------------------- ###
64
+ ### model training ###
65
+ ### -------------------------------- ###
66
+
67
# select features and prediction; the last column is always the prediction
cols = data.shape[1]
num_features = cols - 1
x, y = data.iloc[:, :num_features], data.iloc[:, num_features:]

# hold out a quarter of the rows for testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

# fit a logistic regression with default parameters; fit() returns the
# estimator itself, so construction and training can be chained
model = LogisticRegression().fit(x_train, y_train.values.ravel())
y_pred = model.predict(x_test)
80
+
81
+
82
+ ### -------------------------------- ###
83
+ ### article generation ###
84
+ ### -------------------------------- ###
85
+ # borrow file reading function from reader.py
86
+
87
+ # def get_feat():
88
+ # feats = [abs(x) for x in model.coef_[0]]
89
+ # max_val = max(feats)
90
+ # idx = feats.index(max_val)
91
+ # return data.columns[idx]
92
+
93
+ # acc = str(round(metrics.accuracy_score(y_test, y_pred) * 100, 1)) + '%**'
94
+ # most_imp_feat = get_feat() + "**"
95
+ # info = get_article(acc, most_imp_feat)
96
+
97
+
98
+
99
+ ### ------------------------------- ###
100
+ ### interface creation ###
101
+ ### ------------------------------- ###
102
+
103
+
104
+ # predictor for generic number of features
105
def general_predictor(*args):
    """Predict the label for a single datapoint.

    Args:
        *args: raw feature values, given in the same order as the columns
            of ``data``. Values for columns listed in ``cat_value_dicts``
            are looked up in that column's mapping; all other values are
            passed to the model unchanged.

    Returns:
        The entry of ``cat_value_dicts[final_colname]`` for the predicted
        class, or the raw predicted value when ``final_colname`` has no
        entry in ``cat_value_dicts``.

    Raises:
        KeyError: if a categorical argument is not a key of that column's
            mapping.
    """
    features = []

    # transform categorical input; zip stops at the shorter sequence, so
    # columns beyond the supplied arguments are ignored
    for colname, arg in zip(data.columns, args):
        if colname in cat_value_dicts:
            features.append(cat_value_dicts[colname][arg])
        else:
            features.append(arg)

    # predict single datapoint
    result = model.predict([features])

    # a target column with no mapping has nothing to translate back, so
    # return the model output instead of failing with KeyError
    label_names = cat_value_dicts.get(final_colname)
    if label_names is None:
        return result[0]
    return label_names[result[0]]
119
+
120
# build one input widget per feature column, labelled with the column name
inputls = []
for colname in data.columns:
    # skip last column (the prediction target)
    if colname == final_colname:
        continue

    # access categories dict if data is categorical
    # otherwise, just use a number input
    if colname in cat_value_dicts:
        radio_options = list(cat_value_dicts[colname].keys())
        inputls.append(st.selectbox(colname, radio_options))
    else:
        # add numerical input
        inputls.append(st.number_input(colname))

# run the prediction when the Streamlit submit button is pressed
if st.button("Submit"):
    # general_predictor encodes the categorical selections before calling
    # the model and maps the predicted class back to its label; the widget
    # values cannot be passed to the model directly because they are raw
    # category strings and there is one fewer of them than data.columns
    prediction = general_predictor(*inputls)

    st.text(f"This instance is a {prediction}")