lyangas committed on
Commit
da7535b
β€’
1 Parent(s): afc3da6

add voting

Browse files
Dockerfile CHANGED
@@ -6,6 +6,8 @@ COPY ./requirements.txt /code/requirements.txt
6
 
7
  RUN pip install --upgrade -r /code/requirements.txt
8
 
 
 
9
  COPY ./model_finetuned_clear.pkl ./model_finetuned_clear.pkl
10
  COPY ./required_classes.py ./required_classes.py
11
  COPY ./app.py ./app.py
 
6
 
7
  RUN pip install --upgrade -r /code/requirements.txt
8
 
9
+ COPY ./embedder ./embedder
10
+ COPY ./classifiers ./classifiers
11
  COPY ./model_finetuned_clear.pkl ./model_finetuned_clear.pkl
12
  COPY ./required_classes.py ./required_classes.py
13
  COPY ./app.py ./app.py
app.py CHANGED
@@ -4,33 +4,86 @@ from flask import Flask, request
4
  import json
5
  import pickle
6
  import numpy as np
 
7
 
8
  from required_classes import BertEmbedder, PredictModel
9
 
10
 
11
- print('INFO: loading model')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  try:
13
- with open('model_finetuned_clear.pkl', 'rb') as f:
14
- model = pickle.load(f)
15
- model.batch_size = 1
16
- print('INFO: model loaded')
 
 
 
17
  except Exception as e:
18
- print(f"ERROR: loading models failed with: {str(e)}")
 
19
 
20
  def classify_code(text, top_n):
21
- embed = model._texts2vecs([text])
22
- probs = model.classifier_code.predict_proba(embed)
23
- best_n = np.flip(np.argsort(probs, axis=1,)[0,-top_n:])
24
- preds = [{'code': model.classifier_code.classes_[i], 'proba': probs[0][i]} for i in best_n]
 
 
 
 
25
  return preds
26
 
 
27
  def classify_group(text, top_n):
28
- embed = model._texts2vecs([text])
29
- probs = model.classifier_group.predict_proba(embed)
30
- best_n = np.flip(np.argsort(probs, axis=1,)[0,-top_n:])
31
- preds = [{'group': model.classifier_group.classes_[i], 'proba': probs[0][i]} for i in best_n]
 
 
 
 
32
  return preds
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  app = Flask(__name__)
36
 
@@ -49,8 +102,6 @@ def read_root():
49
  base64_bytes = str(data['textB64']).encode("ascii")
50
  sample_string_bytes = base64.b64decode(base64_bytes)
51
  text = sample_string_bytes.decode("ascii")
52
- print(text)
53
- # text = str(data['text'])
54
  top_n = int(data['top_n'])
55
 
56
  if top_n < 1:
@@ -60,11 +111,13 @@ def read_root():
60
 
61
  pred_codes = classify_code(text, top_n)
62
  pred_groups = classify_group(text, top_n)
 
 
63
  result = {
64
  "icd10":
65
- {'result': pred_codes[0]['code'], 'details': pred_codes},
66
  "dx_group":
67
- {'result': pred_groups[0]['group'], 'details': pred_groups}
68
  }
69
  return result
70
 
 
4
  import json
5
  import pickle
6
  import numpy as np
7
+ import os
8
 
9
  from required_classes import BertEmbedder, PredictModel
10
 
11
 
12
# Ensemble voting weights per classifier family; must sum to 1.0 so the
# combined score in get_top_result is comparable against the 0.5 threshold.
CLS_WEIGHTS = {'mlp': 0.3, 'svc': 0.4, 'xgboost': 0.3}

print('INFO: loading models')
try:
    # NOTE(security): pickle.load executes arbitrary code on load; only
    # ever load artifacts shipped inside this image, never user uploads.
    with open('embedder/embedder.pkl', 'rb') as f:
        embedder = pickle.load(f)
    print('INFO: embedder loaded')
except Exception as e:
    print(f"ERROR: loading embedder failed with: {str(e)}")


def _load_classifiers(directory):
    """Load every pickled classifier found in *directory*.

    Returns a dict keyed by file stem (e.g. 'mlp_codes' for
    'mlp_codes.pkl').  Hidden files such as .DS_Store are skipped.
    On any failure the error is logged and the classifiers loaded so
    far are returned (best-effort, matching the original behavior).
    """
    loaded = {}
    try:
        for clf_name in os.listdir(directory):
            # startswith is safe on empty names, unlike clf_name[0]
            if clf_name.startswith('.'):
                continue
            with open(os.path.join(directory, clf_name), 'rb') as f:
                loaded[clf_name.split('.')[0]] = pickle.load(f)
            print(f'INFO: classifier {clf_name} loaded')
    except Exception as e:
        print(f"ERROR: loading classifiers failed with: {str(e)}")
    return loaded


# Globals consumed by classify_code / classify_group below.
classifiers_codes = _load_classifiers('classifiers/codes')
classifiers_groups = _load_classifiers('classifiers/groups')
46
+
47
 
48
def classify_code(text, top_n):
    """Run every ICD-code classifier on *text*.

    Returns {classifier_name: {class_label: probability}} containing the
    top_n most probable classes per classifier, ordered from most to
    least probable.
    """
    features = [embedder(text)]
    results = {}
    for name, clf in classifiers_codes.items():
        probabilities = clf.predict_proba(features)[0]
        # full descending sort, then keep the first top_n indices
        ranked = np.argsort(probabilities)[::-1][:top_n]
        results[name] = {
            str(clf.classes_[idx]): float(probabilities[idx]) for idx in ranked
        }
    return results
58
 
59
+
60
def classify_group(text, top_n):
    """Run every diagnosis-group classifier on *text*.

    Returns {classifier_name: {class_label: probability}} containing the
    top_n most probable classes per classifier, ordered from most to
    least probable.
    """
    features = [embedder(text)]
    results = {}
    for name, clf in classifiers_groups.items():
        probabilities = clf.predict_proba(features)[0]
        # full descending sort, then keep the first top_n indices
        ranked = np.argsort(probabilities)[::-1][:top_n]
        results[name] = {
            str(clf.classes_[idx]): float(probabilities[idx]) for idx in ranked
        }
    return results
70
 
71
def get_top_result(preds):
    """Combine per-classifier scores into a weighted majority vote.

    *preds* maps a classifier name (e.g. 'mlp_codes') to its
    {class_label: probability} dict as produced by classify_code /
    classify_group.  Each classifier's scores are weighted by
    CLS_WEIGHTS (keyed without the '_codes'/'_groups' suffix) and
    summed per class.  Returns the winning class label when its
    combined score exceeds 0.5, otherwise None (abstain).
    """
    total_scores = {}
    for clf_name, scores in preds.items():
        # strip the task suffix so 'mlp_codes' matches CLS_WEIGHTS['mlp']
        weight = CLS_WEIGHTS[clf_name.replace('_codes', '').replace('_groups', '')]
        for class_name, score in scores.items():
            total_scores[class_name] = total_scores.get(class_name, 0.0) + weight * score

    if not total_scores:
        # no classifier produced any prediction
        return None

    # BUG FIX: the original did np.array(total_scores.values()).argmax(),
    # which builds a 0-d object array whose argmax is always 0 — so the
    # FIRST inserted class was always returned regardless of score.
    best_class = max(total_scores, key=total_scores.get)
    return best_class if total_scores[best_class] > 0.5 else None
86
+
87
 
88
  app = Flask(__name__)
89
 
 
102
  base64_bytes = str(data['textB64']).encode("ascii")
103
  sample_string_bytes = base64.b64decode(base64_bytes)
104
  text = sample_string_bytes.decode("ascii")
 
 
105
  top_n = int(data['top_n'])
106
 
107
  if top_n < 1:
 
111
 
112
  pred_codes = classify_code(text, top_n)
113
  pred_groups = classify_group(text, top_n)
114
+ pred_codes_top = get_top_result(pred_codes)
115
+ pred_groups_top = get_top_result(pred_groups)
116
  result = {
117
  "icd10":
118
+ {'result': pred_codes_top, 'details': pred_codes},
119
  "dx_group":
120
+ {'result': pred_groups_top, 'details': pred_groups}
121
  }
122
  return result
123
 
classifiers/.DS_Store ADDED
Binary file (6.15 kB). View file
 
classifiers/codes/.DS_Store ADDED
Binary file (6.15 kB). View file
 
model_finetuned_clear.pkl β†’ classifiers/codes/mlp_codes.pkl RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c40076019c4b4767021bf208200a8104f0910669d0b56952e6b2eb62b1539d3
3
- size 434856921
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39eda1dc2f583525dd689e048b396f476835c42ed2a8fead4884b6c87bad639d
3
+ size 5185392
classifiers/codes/svc_codes.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:669ddbb9dea179f38dcc5e94282074240eac2cb361e839416a4fc74ea718050d
3
+ size 865684575
classifiers/codes/xgboost_codes.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49a24f3740770aca1f546ccf44e2d2346ba18105920691255b587026cba962d7
3
+ size 17972293
classifiers/groups/.DS_Store ADDED
Binary file (6.15 kB). View file
 
classifiers/groups/mlp_groups.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f494f298b214d3239be0a53ae986b6eadf6822ccb77fcfdf8149b1aad80c87f2
3
+ size 4924655
classifiers/groups/svc_groups.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85d1f0874ebdc3986d412e6d8508d979a324c278d70b29dc4491477b209e69fa
3
+ size 574657606
classifiers/groups/xgboost_groups.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3f1d2fd1f46c18e3bf6b0b67ae8a6c6b52c8b4e06961eff4568cebee7f68989
3
+ size 8361773
embedder/embedder.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b27f378ed88ce2ae9e01e9fd4e5706fc2072957b665c4d12e02a84bbe4f4c4d8
3
+ size 434228972
required_classes.py CHANGED
@@ -25,7 +25,7 @@ class BertEmbedder:
25
  truncation=True).to(self.device)
26
  model_output = self.embedder(**encoded_input)
27
  text_embed = model_output.pooler_output[0].cpu()
28
- return text_embed
29
 
30
  def batch_predict(self, texts: List[str]):
31
  encoded_input = self.tokenizer(texts,
 
25
  truncation=True).to(self.device)
26
  model_output = self.embedder(**encoded_input)
27
  text_embed = model_output.pooler_output[0].cpu()
28
+ return text_embed.tolist()
29
 
30
  def batch_predict(self, texts: List[str]):
31
  encoded_input = self.tokenizer(texts,
requirements.txt CHANGED
@@ -1,5 +1,6 @@
1
  numpy==1.22.4
2
  torch==2.0.1
3
- scikit-learn==1.2.2
4
  transformers==4.29.2
5
  flask==2.0.3
 
 
1
  numpy==1.22.4
2
  torch==2.0.1
3
+ scikit-learn==1.3.0
4
  transformers==4.29.2
5
  flask==2.0.3
6
+ xgboost==1.7.6