# icd10_docker / required_classes.py
# Author: lyangas; commit e1eb682 ("add first code-by-group model"); file size 3.64 kB.
import numpy as np
from typing import List
class BertEmbedder:
    """Embed text with a BERT model by reading its ``pooler_output``.

    ``__call__`` embeds a single text and returns a plain list of floats;
    ``batch_predict`` embeds several texts and returns a CPU tensor with one
    row per text.
    """

    def __init__(self, model_path: str, cut_head: bool = False):
        """
        Load the model and its tokenizer from ``model_path``.

        cut_head = True if the model has a classifier head: only the inner
        ``.bert`` encoder of the loaded model is kept.

        NOTE(review): with cut_head=False the whole
        BertForSequenceClassification model is kept, while the embedding
        methods read ``pooler_output`` from the model output. Presumably the
        classification output does not expose that field -- confirm whether
        cut_head=False is usable at all.
        """
        self.embedder = BertForSequenceClassification.from_pretrained(model_path)
        # Longest sequence the model supports; used as the truncation limit.
        self.max_length = self.embedder.config.max_position_embeddings
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, max_length=self.max_length)

        if cut_head:
            self.embedder = self.embedder.bert

        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.embedder.to(self.device)

    def _pooled_output(self, texts):
        """Tokenize ``texts`` (a string or a list of strings) and return the
        model's ``pooler_output`` moved to the CPU."""
        encoded_input = self.tokenizer(
            texts,
            return_tensors='pt',
            max_length=self.max_length,
            padding=True,
            truncation=True,
        ).to(self.device)
        # Inference only: skip autograd bookkeeping so no graph is built or
        # kept alive by the returned tensor.
        with torch.no_grad():
            model_output = self.embedder(**encoded_input)
        return model_output.pooler_output.cpu()

    def __call__(self, text: str):
        """Return the embedding of a single ``text`` as a list of floats."""
        return self._pooled_output(text)[0].tolist()

    def batch_predict(self, texts: List[str]):
        """Return a CPU tensor holding one embedding row per item of ``texts``."""
        return self._pooled_output(texts)
class PredictModel:
    """Two-stage text classifier: an embedder turns texts into vectors and a
    classifier is fitted on / predicts from those vectors."""

    def __init__(self, embedder, classifier, batch_size=8):
        """
        embedder: object exposing ``batch_predict(list_of_texts)`` whose
            result has a ``tolist()`` method.
        classifier: object exposing ``fit(X, y)`` and ``predict(X)``.
        batch_size: maximum number of texts sent to the embedder at once.
        """
        self.batch_size = batch_size
        self.embedder = embedder
        self.classifier = classifier

    def _texts2vecs(self, texts, log=False):
        """Embed ``texts`` in batches of at most ``batch_size`` items.

        Returns a numpy array with one row per text (an empty array when
        ``texts`` is empty). When ``log`` is true the batches are wrapped in
        a tqdm progress bar.
        """
        # Accept numpy arrays / pandas Series as well as plain sequences.
        items = texts.tolist() if hasattr(texts, "tolist") else list(texts)
        # Fixed-size slices: works when there are fewer texts than one batch
        # and never produces a batch larger than batch_size.
        step = max(1, int(self.batch_size))
        starts = range(0, len(items), step)
        iterator = tqdm(starts) if log else starts

        embeds = []
        for start in iterator:
            batch_texts = items[start:start + step]
            embeds += self.embedder.batch_predict(batch_texts).tolist()
        return np.array(embeds)

    def fit(self, texts: List[str], labels: List[str], log: bool=False):
        """Embed ``texts`` and fit the classifier on the vectors and ``labels``."""
        if log:
            print('Start text2vec transform')
        embeds = self._texts2vecs(texts, log)
        if log:
            print('Start classifier fitting')
        self.classifier.fit(embeds, labels)

    def predict(self, texts: List[str], log: bool=False):
        """Embed ``texts`` and return the classifier's prediction for them."""
        if log:
            print('Start text2vec transform')
        embeds = self._texts2vecs(texts, log)
        if log:
            print('Start classifier prediction')
        prediction = self.classifier.predict(embeds)
        return prediction
class CustomXGBoost:
    """XGBClassifier wrapper that accepts arbitrary labels.

    ``fit`` encodes the labels as integer indices into ``classes_`` before
    training; ``predict`` maps the most probable index back to its label.
    """

    def __init__(self):
        self.model = xgb.XGBClassifier()
        # Sorted unique labels seen in fit(); None until fit() is called.
        self.classes_ = None

    def fit(self, X, y):
        """Train the wrapped model on ``X`` with labels ``y`` encoded as
        positions in ``classes_``."""
        # One pass gives both the sorted unique labels and each sample's
        # index into them, instead of a list.index() lookup per sample.
        classes, codes = np.unique(y, return_inverse=True)
        self.classes_ = classes.tolist()
        self.model.fit(X, codes.reshape(-1).tolist())

    def predict_proba(self, X):
        """Return the wrapped model's class probabilities for ``X``."""
        return self.model.predict_proba(X)

    def predict(self, X):
        """Return, for each row of ``X``, the label with the highest
        predicted probability, as a numpy array."""
        preds = self.model.predict_proba(X)
        best = np.argmax(preds, axis=1)
        # classes_ is a plain list, which cannot be indexed with an index
        # array; go through a numpy array to look all labels up at once.
        return np.asarray(self.classes_)[best]
class SimpleModel:
    """Constant stand-in model: it remembers the first label passed to
    ``fit`` and always reports a probability of 1.0."""

    def __init__(self):
        # Holds a one-element list with the remembered label after fit().
        self.classes_ = None

    def fit(self, X, y):
        """Store the first label of ``y`` as the only class; ``X`` is not used."""
        first_label = y[0]
        self.classes_ = [first_label]

    def predict_proba(self, X):
        """Return a one-element array containing 1.0; ``X`` is not used."""
        return np.ones(1)