# opdx/helpers/required_classes.py
from typing import List

import numpy as np
import pandas as pd
import torch
import xgboost as xgb
from tqdm import tqdm
from transformers import AutoTokenizer, BertForSequenceClassification


class BertEmbedder:
    def __init__(self, tokenizer_path: str, model_path: str, cut_head: bool = False):
        """
        cut_head=True if the loaded model has a classification head on top;
        the head is then stripped so that only the base BERT encoder is kept
        and its pooler_output can be used for embedding extraction.
        """
        self.embedder = BertForSequenceClassification.from_pretrained(model_path)
        self.max_length = self.embedder.config.max_position_embeddings
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, max_length=self.max_length)
        if cut_head:
            # Keep only the base encoder; __call__ and batch_predict
            # rely on its pooler_output.
            self.embedder = self.embedder.bert
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        print(f"Used device for BERT: {self.device}", flush=True)
        self.embedder.to(self.device)

    def __call__(self, text: str):
        # Embed a single text and return the pooled representation on CPU.
        encoded_input = self.tokenizer(text,
                                       return_tensors='pt',
                                       max_length=self.max_length,
                                       padding=True,
                                       truncation=True).to(self.device)
        with torch.no_grad():  # inference only; don't build a graph
            model_output = self.embedder(**encoded_input)
        text_embed = model_output.pooler_output[0].cpu()
        return text_embed

    def batch_predict(self, texts: List[str]):
        # Same as __call__, but for a batch of texts; sequences are padded
        # to the longest text in the batch.
        encoded_input = self.tokenizer(texts,
                                       return_tensors='pt',
                                       max_length=self.max_length,
                                       padding=True,
                                       truncation=True).to(self.device)
        with torch.no_grad():
            model_output = self.embedder(**encoded_input)
        texts_embeds = model_output.pooler_output.cpu()
        return texts_embeds
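
# Usage sketch (illustrative, not part of the original module).
# "bert-base-uncased" is a placeholder checkpoint name, not one shipped
# with this repo; cut_head=True is required so pooler_output exists:
#
#     embedder = BertEmbedder(
#         tokenizer_path="bert-base-uncased",
#         model_path="bert-base-uncased",
#         cut_head=True,  # strip the classifier head, keep the encoder
#     )
#     vec = embedder("patient reports chest pain")  # 1-D torch.Tensor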


class PredictModel:
    def __init__(self, embedder, classifier_code, classifier_group, batch_size=8):
        self.batch_size = batch_size
        self.embedder = embedder
        self.classifier_code = classifier_code
        self.classifier_group = classifier_group

    def _texts2vecs(self, texts, logging=False):
        # Embed texts batch by batch to keep memory usage bounded.
        embeds = []
        n_batches = max(1, len(texts) // self.batch_size)  # avoid zero sections for short inputs
        batches_texts = np.array_split(texts, n_batches)
        iterator = tqdm(batches_texts) if logging else batches_texts
        for batch_texts in iterator:
            batch_texts = batch_texts.tolist()
            embeds += self.embedder.batch_predict(batch_texts).tolist()
        embeds = np.array(embeds)
        return embeds
    def fit(self, texts: List[str], labels: List[str], logging: bool = False):
        if logging:
            print('Start text2vec transform')
        embeds = self._texts2vecs(texts, logging)
        if logging:
            print('Start codes-classifier fitting')
        self.classifier_code.fit(embeds, labels)
        # Group labels are the code prefixes before the dot
        # (e.g. an ICD-style code "A00.1" belongs to group "A00").
        labels = [l.split('.')[0] for l in labels]
        if logging:
            print('Start groups-classifier fitting')
        self.classifier_group.fit(embeds, labels)

    def predict_code(self, texts: List[str], logging: bool = False):
        if logging:
            print('Start text2vec transform')
        embeds = self._texts2vecs(texts, logging)
        if logging:
            print('Start classifier prediction')
        prediction = self.classifier_code.predict(embeds)
        return prediction

    def predict_group(self, texts: List[str], logging: bool = False):
        if logging:
            print('Start text2vec transform')
        embeds = self._texts2vecs(texts, logging)
        if logging:
            print('Start classifier prediction')
        prediction = self.classifier_group.predict(embeds)
        return prediction
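
# Usage sketch (illustrative; the label format is inferred from fit(),
# which derives group labels by splitting codes on '.'). Checkpoint
# names are placeholders:
#
#     model = PredictModel(
#         embedder=BertEmbedder("bert-base-uncased", "bert-base-uncased", cut_head=True),
#         classifier_code=CustomXGBoost(use_gpu=False),
#         classifier_group=CustomXGBoost(use_gpu=False),
#     )
#     model.fit(texts, labels, logging=True)   # labels like ["A00.1", ...]
#     codes = model.predict_code(new_texts)    # full codes, e.g. "A00.1"
#     groups = model.predict_group(new_texts)  # group prefixes, e.g. "A00"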


class CustomXGBoost:
    """Thin wrapper around xgb.XGBClassifier that accepts string labels."""

    def __init__(self, use_gpu):
        if use_gpu:
            self.model = xgb.XGBClassifier(tree_method="gpu_hist")
        else:
            self.model = xgb.XGBClassifier()
        self.classes_ = None

    def fit(self, X, y, **kwargs):
        # XGBoost expects integer class ids, so remember the original
        # labels and encode each one as its index in self.classes_.
        self.classes_ = np.unique(y).tolist()
        y = [self.classes_.index(l) for l in y]
        self.model.fit(X, y, **kwargs)

    def predict_proba(self, X):
        pred = self.model.predict_proba(X)
        return pred

    def predict(self, X):
        # Decode integer predictions back to the original labels.
        preds = self.model.predict_proba(X)
        return np.array([self.classes_[p] for p in np.argmax(preds, axis=1)])
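
# Usage sketch (illustrative): the wrapper round-trips string labels,
# so predict() returns original label strings rather than integer ids:
#
#     clf = CustomXGBoost(use_gpu=False)
#     clf.fit(np.random.rand(10, 4), ["A00", "B01"] * 5)
#     clf.predict(np.random.rand(2, 4))  # elements drawn from clf.classes_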


class SimpleModel:
    """Constant classifier: always predicts the single class seen during fit."""

    def __init__(self):
        self.classes_ = None

    def fit(self, X, y):
        print(y[0])  # debug output: the single label this model will predict
        self.classes_ = [y[0]]

    def predict_proba(self, X):
        return np.array([[1.0]] * len(X))
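
# Usage sketch (illustrative): presumably a fallback for the case where
# only one class is available, so a real classifier cannot be trained:
#
#     fallback = SimpleModel()
#     fallback.fit([[0.0]], ["A00.1"])
#     fallback.predict_proba([[0.0], [1.0]])  # -> array([[1.0], [1.0]])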


def balance_dataset(labels_train_for_group, vecs_train_for_group, balance=None, logging=True):
    """Balance a labelled embedding set.

    balance='remove'    -> undersample every class to the size of the smallest one
    balance='duplicate' -> oversample every class (by repetition) to the size of the largest one
    Any other value returns the inputs unchanged. Both inputs are expected
    to be numpy arrays.
    """
    if balance == 'remove':
        # Find the size of the smallest class.
        min_len = -1
        for code_l in np.unique(labels_train_for_group):
            cur_len = sum(labels_train_for_group == code_l)
            if logging:
                print(code_l, cur_len)
            if min_len > cur_len or min_len == -1:
                min_len = cur_len
        if logging:
            print('min_len is', min_len)
        # Randomly sample min_len rows from each class without replacement.
        df_train_group = pd.DataFrame()
        df_train_group['labels'] = labels_train_for_group
        df_train_group['vecs'] = vecs_train_for_group.tolist()
        df_train_group = df_train_group.groupby('labels', as_index=False).apply(
            lambda array: array.loc[np.random.choice(array.index, min_len, False), :])
        labels_train_for_group = df_train_group['labels'].values
        vecs_train_for_group = [np.array(v) for v in df_train_group['vecs'].values]
    elif balance == 'duplicate':
        df_train_group = pd.DataFrame()
        df_train_group['labels'] = labels_train_for_group
        df_train_group['vecs'] = vecs_train_for_group.tolist()
        # Find the size of the largest class.
        max_len = 0
        for code_data in df_train_group.groupby('labels'):
            cur_len = len(code_data[1])
            if logging:
                print(code_data[0], cur_len)
            if max_len < cur_len:
                max_len = cur_len
        if logging:
            print('max_len is', max_len)
        # Repeatedly double each class, then trim it to exactly max_len rows.
        labels_train_for_group = []
        vecs_train_for_group = []
        for code_data in df_train_group.groupby('labels'):
            cur_len = len(code_data[1])
            cur_labels = code_data[1]['labels'].values.tolist()
            cur_vecs = code_data[1]['vecs'].values.tolist()
            while cur_len < max_len:
                cur_len *= 2
                cur_labels += cur_labels
                cur_vecs += cur_vecs
            cur_labels = cur_labels[:max_len]
            cur_vecs = cur_vecs[:max_len]
            labels_train_for_group += cur_labels
            vecs_train_for_group += cur_vecs
        labels_train_for_group = np.array(labels_train_for_group)
        vecs_train_for_group = np.array(vecs_train_for_group)
    return labels_train_for_group, vecs_train_for_group
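
# Usage sketch (illustrative): undersampling a 3:1 label split down to
# one sample per class; the 768-dim vectors stand in for BERT embeddings:
#
#     labels = np.array(["A00", "A00", "A00", "B01"])
#     vecs = np.random.rand(4, 768)
#     labels_bal, vecs_bal = balance_dataset(labels, vecs, balance='remove', logging=False)
#     # len(labels_bal) == 2: one randomly chosen sample per class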