import numpy as np
from typing import List

# NOTE(review): `torch`, `BertForSequenceClassification`, `AutoTokenizer`,
# `tqdm` and `xgb` are used below but are not imported in the visible part of
# this file -- confirm they are brought into scope elsewhere.


class BertEmbedder:
    """Embed text with the pooled output of a BERT model."""

    def __init__(self, model_path: str, cut_head: bool = False):
        """Load the model and its tokenizer from ``model_path``.

        Args:
            model_path: Location handed to both ``from_pretrained`` calls.
            cut_head: Set to ``True`` when the loaded model has a classifier
                head; only its ``.bert`` sub-module is then kept.
        """
        self.embedder = BertForSequenceClassification.from_pretrained(model_path)
        self.max_length = self.embedder.config.max_position_embeddings
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, max_length=self.max_length)
        if cut_head:
            self.embedder = self.embedder.bert
        # NOTE(review): with cut_head=False the full classification model is
        # kept, yet the methods below read `.pooler_output` -- presumably that
        # attribute only exists on the bare encoder output; confirm.
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.embedder.to(self.device)

    def _pooled_output(self, texts):
        """Tokenize ``texts``, run the model and return pooler_output on CPU."""
        encoded_input = self.tokenizer(
            texts,
            return_tensors='pt',
            max_length=self.max_length,
            padding=True,
            truncation=True,
        ).to(self.device)
        # Inference only: skip autograd bookkeeping so no graph is retained.
        with torch.no_grad():
            model_output = self.embedder(**encoded_input)
        return model_output.pooler_output.cpu()

    def __call__(self, text: str):
        """Return the embedding of a single text as a plain Python list."""
        return self._pooled_output(text)[0].tolist()

    def batch_predict(self, texts: List[str]):
        """Return the embeddings of ``texts`` as one CPU tensor (one row per text)."""
        return self._pooled_output(texts)


class PredictModel:
    """Pipeline that embeds texts and feeds the vectors to a classifier."""

    def __init__(self, embedder, classifier, batch_size=8):
        """Store the collaborators.

        Args:
            embedder: Object exposing ``batch_predict(list_of_texts)`` whose
                result has a ``tolist()`` method.
            classifier: Object exposing ``fit(X, y)`` and ``predict(X)``.
            batch_size: Maximum number of texts embedded per call.
        """
        self.batch_size = batch_size
        self.embedder = embedder
        self.classifier = classifier

    def _texts2vecs(self, texts, log=False):
        """Embed ``texts`` batch by batch and return the stacked vectors.

        Args:
            texts: Sequence of texts to embed; may be empty.
            log: When true, wrap the batch loop in ``tqdm``.

        Returns:
            ``np.ndarray`` built from the per-text embedding lists.

        Raises:
            ValueError: If ``self.batch_size`` is smaller than 1.
        """
        if self.batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        texts = list(texts)
        # Fixed-size slices: every batch holds at most `batch_size` texts and
        # inputs shorter than one batch (or empty) are handled without error.
        batches_texts = [
            texts[start:start + self.batch_size]
            for start in range(0, len(texts), self.batch_size)
        ]
        iterator = tqdm(batches_texts) if log else batches_texts

        embeds = []
        for batch_texts in iterator:
            embeds.extend(self.embedder.batch_predict(batch_texts).tolist())
        return np.array(embeds)

    def fit(self, texts: List[str], labels: List[str], log: bool = False):
        """Embed ``texts`` and fit the classifier on the vectors and ``labels``."""
        if log:
            print('Start text2vec transform')
        embeds = self._texts2vecs(texts, log)
        if log:
            print('Start classifier fitting')
        self.classifier.fit(embeds, labels)

    def predict(self, texts: List[str], log: bool = False):
        """Embed ``texts`` and return the classifier's prediction for them."""
        if log:
            print('Start text2vec transform')
        embeds = self._texts2vecs(texts, log)
        if log:
            print('Start classifier prediction')
        prediction = self.classifier.predict(embeds)
        return prediction


class CustomXGBoost:
    """``xgb.XGBClassifier`` wrapper that accepts arbitrary label values."""

    def __init__(self):
        self.model = xgb.XGBClassifier()
        # Unique labels seen in `fit`; position in this list is the class id.
        self.classes_ = None

    def fit(self, X, y):
        """Fit the model on ``X`` with ``y`` encoded as indices into ``classes_``."""
        self.classes_ = np.unique(y).tolist()
        y = [self.classes_.index(label) for label in y]
        self.model.fit(X, y)

    def predict_proba(self, X):
        """Return the wrapped model's ``predict_proba`` output for ``X``."""
        pred = self.model.predict_proba(X)
        return pred

    def predict(self, X):
        """Return, for every row of ``X``, the label with the highest probability.

        Returns:
            ``np.ndarray`` of labels taken from ``classes_``.
        """
        preds = self.model.predict_proba(X)
        best = np.argmax(preds, axis=1)
        # `classes_` is a plain list and cannot be indexed by an index array,
        # so convert it to an array before the lookup.
        return np.asarray(self.classes_)[best]


class SimpleModel:
    """Trivial model that knows a single class."""

    def __init__(self):
        self.classes_ = None

    def fit(self, X, y):
        """Remember the first label of ``y`` as the only class; ``X`` is unused."""
        self.classes_ = [y[0]]

    def predict_proba(self, X):
        """Return probability 1.0 for the single class, one row per item of ``X``."""
        return np.array([[1.0]] * len(X))