File size: 4,331 Bytes
6832fea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import gradio as gr
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from reader import get_article


### ------------------------------ ###
###       data transformation      ###
### ------------------------------ ###
# options constants
# options[i] lists the labels allowed in the i-th column of data.csv, ordered
# from least to most favorable; a label's position in its list is the number
# it is encoded as
options = [
  ['Very Poorly Aligned', 'Poorly Aligned', 'Neutrally Aligned', 'Well Aligned', 'Very Well Aligned'],
  ['Very Limited Experience', 'Limited Experience', 'Neutral Experience', 'Extensive Experience', 'Very Extensive Experience'],
  ['Extremely Unattractive', 'Moderately Unattractive', 'Neutrally Attractive', 'Moderately Attractive', 'Extremely Attractive'],
  ['Very Unfavorable', 'Moderately Unfavorable', 'Neutrally Favorable', 'Moderately Favorable', 'Very Favorable'],
  ['Very Poor Fit', 'Poor Fit', 'Neutral Fit', 'Moderately Good Fit', 'Excellent Fit']
]

# load dataset
uncleaned_data = pd.read_csv('data.csv')
data = pd.DataFrame()

# keep track of which columns are categorical and what
# those columns' value mappings are
# structure: {colname1: {...}, colname2: {...} }
# every column maps label -> number, except the final column, which is
# stored reversed (number -> label) so predictions can be turned back into text
cat_value_dicts = {}
col = 0
final_colname = uncleaned_data.columns[4]

# for each column...
# (DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column name, Series) pairs and exists in older versions as well)
for (colname, colval) in uncleaned_data.items():

  # structure: {"Very Poorly Aligned": 0, "Poorly Aligned": 1, ...}
  # seeded with every allowed label rather than only the labels present in
  # the data, so any option offered in the interface can be encoded later
  new_dict = {label: position for (position, label) in enumerate(options[col])}

  # new numeric datapoints
  transformed_col_vals = []

  # for each item in that column...
  for item in colval.values:

    # a value outside the allowed labels cannot be encoded
    if item not in new_dict:
      raise ValueError(f"unexpected value {item!r} in column {colname!r} of data.csv")

    # then add numerical value to transformed dataframe
    transformed_col_vals.append(new_dict[item])

  # reverse dictionary only for final col (0, 1) => (vals)
  if colname == final_colname:
    new_dict = {value : key for (key, value) in new_dict.items()}

  cat_value_dicts[colname] = new_dict
  data[colname] = transformed_col_vals
  col += 1


### -------------------------------- ###
###           model training         ###
### -------------------------------- ###

# split the encoded table into inputs and target: the first `num_features`
# columns are the features, every remaining column is the prediction
num_features = 4
x = data.iloc[:, :num_features]
y = data.iloc[:, num_features:]

# hold out 25% of the rows for testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

# fit the model on the training rows; the target frame is flattened into a
# 1-D array of labels before fitting (fit() returns the fitted estimator)
model = LogisticRegression(max_iter=100).fit(x_train, y_train.values.ravel())

# predictions for the held-out rows
y_pred = model.predict(x_test)


### -------------------------------- ###
###        article generation        ###
### -------------------------------- ###
# borrow file reading function from reader.py

def get_feats():
  """Return column names ordered from largest to smallest coefficient magnitude.

  Each entry of ``model.coef_[0]`` is paired by position with a name from
  ``data.columns``; pairing stops at the shorter of the two sequences, so
  columns without a coefficient are left out. Names with equal magnitudes
  are ordered by name, descending.

  Returns:
    A list of column names; empty when there are no coefficients.
  """
  weights = [abs(coef) for coef in model.coef_[0]]

  # sort (magnitude, name) pairs so the largest magnitude comes first
  ranked = sorted(zip(weights, data.columns), reverse=True)
  return [name for (_, name) in ranked]
  
# test-set accuracy as a percentage string rounded to two decimals
acc = f"{round(metrics.accuracy_score(y_test, y_pred) * 100, 2)!s}%"

# column names returned by get_feats, passed to the article alongside the accuracy
feats = get_feats()
info = get_article(acc, feats)



### ------------------------------- ###
###        interface creation       ###
### ------------------------------- ###

def predictor(*args):
  """Predict the final-column label for one set of answers.

  Args:
    *args: one selected option label per feature, in the same order as the
      columns of ``data``.

  Returns:
    The entry of ``cat_value_dicts[final_colname]`` for the class the model
    predicts.

  Raises:
    ValueError: if an answer is neither in its column's encoding nor in the
      matching ``options`` list.
  """
  features = []

  # transform categorical input
  for num, choice in enumerate(args):
    encoding = cat_value_dicts[data.columns[num]]

    if choice in encoding:
      features.append(encoding[choice])
    else:
      # the encoding may lack labels that never occur in the dataset; such
      # options are encoded by their position in the options list, which is
      # the same numbering the encoding is built from
      features.append(options[num].index(choice))

  # predict single datapoint
  new_input = [features]
  result = model.predict(new_input)
  return cat_value_dicts[final_colname][result[0]]

# question text for each input; predictor takes star-args, so the inputs
# carry no names of their own and are labelled from this list
labels = [
  "How Well Do They Align with RS21's 9 Core Values?",
  "How Experienced Are They in RS21's Markets?",
  "How Attractive is Their Valuation of RS21?",
  "How Favorable is Their Proposed Deal Structure for RS21?"
]

# one radio group per question, offering that question's option labels
inputls = [
  gr.inputs.Radio(choices=options[num], type="value", label=question)
  for (num, question) in enumerate(labels)
]

# generate gradio interface
interface = gr.Interface(
  predictor,
  inputs=inputls,
  outputs="text",
  title=info['title'],
  description=info['description'],
  article=info['article'],
  css=info['css'],
  theme="grass",
  allow_flagging='never',
)

# show the interface
interface.launch()