# NOTE: the lines "Spaces:" / "Runtime error" above this module were
# Hugging Face Spaces build-log residue pasted into the source file,
# not Python code; kept only as this comment so the module stays importable.
from typing import List

import numpy as np
import pandas as pd
import torch
import xgboost as xgb
from tqdm import tqdm
from transformers import AutoTokenizer, BertForSequenceClassification
class BertEmbedder:
    """Produce sentence embeddings (BERT pooler output) for single texts or batches."""

    def __init__(self, tokenizer_path: str, model_path: str, cut_head: bool = False):
        """
        Args:
            tokenizer_path: path or hub id of the tokenizer.
            model_path: path or hub id of the BertForSequenceClassification checkpoint.
            cut_head: True if the checkpoint has a classification head that must be
                stripped, leaving only the bare BERT encoder.
        """
        self.embedder = BertForSequenceClassification.from_pretrained(model_path)
        self.max_length = self.embedder.config.max_position_embeddings
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, max_length=self.max_length)
        if cut_head:
            # Keep only the encoder; its outputs expose `pooler_output`.
            self.embedder = self.embedder.bert
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        print(f"Used device for BERT: {self.device }", flush=True)
        # Inference-only usage below: switch off dropout for deterministic embeddings.
        self.embedder.eval()
        self.embedder.to(self.device)

    def __call__(self, text: str):
        """Embed one text; returns a 1-D CPU tensor (pooler output of the first item).

        NOTE(review): `pooler_output` exists on the bare encoder's output; with
        cut_head=False the classification model's output has no such attribute —
        confirm callers always construct this with cut_head=True.
        """
        encoded_input = self.tokenizer(text,
                                       return_tensors='pt',
                                       max_length=self.max_length,
                                       padding=True,
                                       truncation=True).to(self.device)
        # no_grad: avoid building autograd graphs (memory leak during inference).
        with torch.no_grad():
            model_output = self.embedder(**encoded_input)
        text_embed = model_output.pooler_output[0].cpu()
        return text_embed

    def batch_predict(self, texts: List[str]):
        """Embed a batch of texts; returns a (batch, hidden_size) CPU tensor."""
        encoded_input = self.tokenizer(texts,
                                       return_tensors='pt',
                                       max_length=self.max_length,
                                       padding=True,
                                       truncation=True).to(self.device)
        with torch.no_grad():
            model_output = self.embedder(**encoded_input)
        texts_embeds = model_output.pooler_output.cpu()
        return texts_embeds
class PredictModel:
    """Two-stage text classifier on top of a sentence embedder.

    `classifier_code` is trained on full labels such as "A.1"; `classifier_group`
    is trained on the prefix before the first '.' (e.g. "A"). Both classifiers
    must expose sklearn-style fit/predict; the embedder must expose
    batch_predict(list[str]) -> 2-D array-like.
    """

    def __init__(self, embedder, classifier_code, classifier_group, batch_size=8):
        self.batch_size = batch_size
        self.embedder = embedder
        self.classifier_code = classifier_code
        self.classifier_group = classifier_group

    def _texts2vecs(self, texts, logging=False):
        """Embed `texts` batch by batch; returns a 2-D np.array of embeddings."""
        embeds = []
        # max(1, ...): the previous plain floor division yielded 0 sections
        # whenever len(texts) < batch_size, making np.array_split raise
        # ValueError. One section (a single batch) is the correct fallback.
        n_batches = max(1, len(texts) // self.batch_size)
        batches_texts = np.array_split(texts, n_batches)
        iterator = tqdm(batches_texts) if logging else batches_texts
        for batch_texts in iterator:
            embeds += self.embedder.batch_predict(batch_texts.tolist()).tolist()
        return np.array(embeds)

    def fit(self, texts: List[str], labels: List[str], logging: bool = False):
        """Fit both classifiers; group labels are the code prefixes before '.'."""
        if logging:
            print('Start text2vec transform')
        embeds = self._texts2vecs(texts, logging)
        if logging:
            print('Start codes-classifier fitting')
        self.classifier_code.fit(embeds, labels)
        labels = [l.split('.')[0] for l in labels]
        if logging:
            print('Start groups-classifier fitting')
        self.classifier_group.fit(embeds, labels)

    def predict_code(self, texts: List[str], log: bool = False):
        """Predict fine-grained codes for `texts`."""
        if log:
            print('Start text2vec transform')
        embeds = self._texts2vecs(texts, log)
        if log:
            print('Start classifier prediction')
        return self.classifier_code.predict(embeds)

    def predict_group(self, texts: List[str], logging: bool = False):
        """Predict coarse groups for `texts`."""
        if logging:
            print('Start text2vec transform')
        embeds = self._texts2vecs(texts, logging)
        if logging:
            print('Start classifier prediction')
        return self.classifier_group.predict(embeds)
class CustomXGBoost:
    """XGBoost wrapper with an sklearn-like API that maps arbitrary (e.g. string)
    labels to integer class indices and back."""

    def __init__(self, use_gpu):
        if use_gpu:
            # gpu_hist: GPU training method (legacy xgboost parameter name).
            self.model = xgb.XGBClassifier(tree_method="gpu_hist")
        else:
            self.model = xgb.XGBClassifier()
        # Sorted unique labels seen in fit(); index in this list == model class id.
        self.classes_ = None

    def fit(self, X, y, **kwargs):
        """Fit on X with arbitrary labels y; stores the label order in classes_."""
        self.classes_ = np.unique(y).tolist()
        # Dict lookup is O(1) per label; the previous list.index call was O(k)
        # per label, i.e. O(n*k) over the whole training set.
        index_of = {label: i for i, label in enumerate(self.classes_)}
        y = [index_of[l] for l in y]
        self.model.fit(X, y, **kwargs)

    def predict_proba(self, X):
        """Class-probability matrix, columns ordered as in classes_."""
        pred = self.model.predict_proba(X)
        return pred

    def predict(self, X):
        """Predict original labels by argmax over the probability matrix."""
        preds = self.model.predict_proba(X)
        return np.array([self.classes_[p] for p in np.argmax(preds, axis=1)])
class SimpleModel:
    """Degenerate single-class classifier: memorizes the first training label
    and always predicts it with probability 1. Useful as a stand-in where a
    group has only one possible label."""

    def __init__(self):
        # Single-element list with the memorized label; None until fit().
        self.classes_ = None

    def fit(self, X, y):
        """Remember the first label of y as the only class.

        Raises IndexError if y is empty (nothing to memorize).
        """
        # (Removed a leftover debug print of y[0].)
        self.classes_ = [y[0]]

    def predict_proba(self, X):
        """Probability 1.0 for the single class, one row per sample."""
        return np.array([[1.0]] * len(X))

    def predict(self, X):
        """Return the memorized label for every sample.

        Added for interface consistency: PredictModel calls `predict` on its
        classifiers, which CustomXGBoost provides but this class did not.
        """
        return np.array(self.classes_ * len(X))
def balance_dataset(labels_train_for_group, vecs_train_for_group, balance=None, logging=True):
    """Balance a labeled embedding set by class size.

    Args:
        labels_train_for_group: 1-D numpy array of labels (the `==` comparison
            below relies on numpy element-wise broadcasting).
        vecs_train_for_group: numpy array of embedding vectors, one row per label.
        balance: 'remove' -> downsample every class to the smallest class size;
            'duplicate' -> upsample every class (by repetition) to the largest
            class size; any other value returns the inputs unchanged.
        logging: if True, print per-class counts and the chosen min/max size.
            NOTE(review): parameter name shadows the stdlib `logging` module.

    Returns:
        (labels, vecs) tuple. NOTE(review): the 'remove' branch returns vecs as
        a plain list of 1-D arrays while 'duplicate' returns a 2-D np.array and
        the default branch returns the input unchanged — callers must tolerate
        all three; confirm downstream usage.
    """
    if balance == 'remove':
        # Find the smallest class size; -1 means "not set yet".
        min_len = -1
        for code_l in np.unique(labels_train_for_group):
            cur_len = sum(labels_train_for_group==code_l)
            if logging:
                print(code_l, cur_len)
            if min_len > cur_len or min_len==-1:
                min_len = cur_len
        if logging:
            print('min_len is', min_len)
        # Sample min_len rows per class without replacement (non-deterministic:
        # depends on the global numpy RNG state).
        df_train_group = pd.DataFrame()
        df_train_group['labels'] = labels_train_for_group
        df_train_group['vecs'] = vecs_train_for_group.tolist()
        df_train_group = df_train_group.groupby('labels', as_index=False).apply(lambda array: array.loc[np.random.choice(array.index, min_len, False),:])
        labels_train_for_group = df_train_group['labels'].values
        vecs_train_for_group = [np.array(v) for v in df_train_group['vecs'].values]
    elif balance == 'duplicate':
        df_train_group = pd.DataFrame()
        df_train_group['labels'] = labels_train_for_group
        df_train_group['vecs'] = vecs_train_for_group.tolist()
        # Find the largest class size.
        max_len = 0
        for code_data in df_train_group.groupby('labels'):
            # code_data is a (label, sub-DataFrame) pair from groupby.
            cur_len = len(code_data[1])
            if logging:
                print(code_data[0], cur_len)
            if max_len < cur_len:
                max_len = cur_len
        if logging:
            print('max_len is ', max_len)
        labels_train_for_group = []
        vecs_train_for_group = []
        # Repeatedly double each class until it reaches max_len, then trim the
        # overshoot so every class ends up exactly max_len rows.
        for code_data in df_train_group.groupby('labels'):
            cur_len = len(code_data[1])
            cur_labels = code_data[1]['labels'].values.tolist()
            cur_vecs = code_data[1]['vecs'].values.tolist()
            while cur_len < max_len:
                cur_len *= 2
                cur_labels += cur_labels
                cur_vecs += cur_vecs
            cur_labels = cur_labels[:max_len]
            cur_vecs = cur_vecs[:max_len]
            labels_train_for_group += cur_labels
            vecs_train_for_group += cur_vecs
        labels_train_for_group = np.array(labels_train_for_group)
        vecs_train_for_group = np.array(vecs_train_for_group)
    return labels_train_for_group, vecs_train_for_group