lyangas committed on
Commit
1bb21bd
1 Parent(s): 8e8bb53

first demo code added

app.py ADDED
@@ -0,0 +1,23 @@
+ print('INFO: import modules')
+ import gradio as gr
+ import pickle
+ from required_classes import *
+
+
+ print('INFO: loading model')
+ try:
+     with open('pretrain_logistic_regression_model.pkl', 'rb') as f:
+         model = pickle.load(f)
+     model.batch_size = 1
+     print('INFO: model loaded')
+ except Exception as e:
+     print(f"ERROR: loading model failed with: {str(e)}")
+
+ def classify(text):
+     pred_classes = model.predict([text])
+     output_text = ' '.join(pred_classes)
+     return output_text
+
+ print('INFO: starting gradio interface')
+ iface = gr.Interface(fn=classify, inputs="text", outputs="text")
+ iface.launch()
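
Once `app.py` is running, Gradio serves the interface on its default local port. A minimal smoke-test sketch against the JSON endpoint that Gradio 3.x exposes for every interface (the URL, port, and `/api/predict` route are the library's defaults, assumed here rather than taken from this commit):

```python
import requests

# Gradio 3.x serves a JSON API for each interface at /api/predict.
resp = requests.post(
    "http://127.0.0.1:7860/api/predict",
    json={"data": ["example input text"]},  # one entry per input component
)
resp.raise_for_status()
print(resp.json()["data"][0])  # the space-joined predicted class labels
```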
pretrain_logistic_regression_model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7b16160b18246480dcac36128ff59da49c4410472a98bc4955a4841655f7ed50
+ size 434671977
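
This entry is a Git LFS pointer, not the model itself: the ~435 MB pickle lives in LFS storage, so the repository must be cloned with git-lfs installed (or the file fetched with `git lfs pull`) before `app.py` can load it.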
required_classes.py ADDED
@@ -0,0 +1,79 @@
+ import numpy as np
+ import torch
+ from tqdm import tqdm
+ from transformers import AutoTokenizer, BertForSequenceClassification
+ from typing import List
+
+
+ class BertEmbedder:
+     def __init__(self, model_path: str, cut_head: bool = False):
+         """
+         cut_head = True if the model has a classification head
+         """
+         self.embedder = BertForSequenceClassification.from_pretrained(model_path)
+         self.max_length = self.embedder.config.max_position_embeddings
+         self.tokenizer = AutoTokenizer.from_pretrained(model_path, max_length=self.max_length)
+
+         if cut_head:
+             # keep only the bare BertModel so its outputs expose pooler_output
+             self.embedder = self.embedder.bert
+
+         self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+         self.embedder.to(self.device)
+
+     def __call__(self, text: str):
+         encoded_input = self.tokenizer(text,
+                                        return_tensors='pt',
+                                        max_length=self.max_length,
+                                        padding=True,
+                                        truncation=True).to(self.device)
+         model_output = self.embedder(**encoded_input)
+         text_embed = model_output.pooler_output[0].cpu()
+         return text_embed
+
+     def batch_predict(self, texts: List[str]):
+         encoded_input = self.tokenizer(texts,
+                                        return_tensors='pt',
+                                        max_length=self.max_length,
+                                        padding=True,
+                                        truncation=True).to(self.device)
+         model_output = self.embedder(**encoded_input)
+         texts_embeds = model_output.pooler_output.cpu()
+         return texts_embeds
+
+ class PredictModel:
+     def __init__(self, embedder, classifier, batch_size=8):
+         self.batch_size = batch_size
+         self.embedder = embedder
+         self.classifier = classifier
+
+     def _texts2vecs(self, texts, log=False):
+         embeds = []
+         # use at least one batch so inputs shorter than batch_size do not crash np.array_split
+         batches_texts = np.array_split(texts, max(1, len(texts) // self.batch_size))
+         if log:
+             iterator = tqdm(batches_texts)
+         else:
+             iterator = batches_texts
+         for batch_texts in iterator:
+             batch_texts = batch_texts.tolist()
+             embeds += self.embedder.batch_predict(batch_texts).tolist()
+         embeds = np.array(embeds)
+         return embeds
+
+     def fit(self, texts: List[str], labels: List[str], log: bool = False):
+         if log:
+             print('Start text2vec transform')
+         embeds = self._texts2vecs(texts, log)
+         if log:
+             print('Start classifier fitting')
+         self.classifier.fit(embeds, labels)
+
+     def predict(self, texts: List[str], log: bool = False):
+         if log:
+             print('Start text2vec transform')
+         embeds = self._texts2vecs(texts, log)
+         if log:
+             print('Start classifier prediction')
+         prediction = self.classifier.predict(embeds)
+         return prediction
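
The pickled `PredictModel` bundles a `BertEmbedder` with a fitted scikit-learn classifier. A hypothetical sketch of how a compatible pickle could be produced; the base checkpoint name, training texts, and labels below are placeholders, since none of them are part of this commit:

```python
import pickle

from sklearn.linear_model import LogisticRegression

from required_classes import BertEmbedder, PredictModel

# Assumed base checkpoint; cut_head=True keeps the bare BertModel so that
# pooler_output is available for embedding.
embedder = BertEmbedder('bert-base-uncased', cut_head=True)
classifier = LogisticRegression(max_iter=1000)
model = PredictModel(embedder, classifier, batch_size=8)

texts = ['first placeholder text', 'second placeholder text']  # placeholder data
labels = ['class_a', 'class_b']
model.fit(texts, labels)

with open('pretrain_logistic_regression_model.pkl', 'wb') as f:
    pickle.dump(model, f)
```

Unpickling later only requires that `required_classes` be importable, which is why `app.py` runs `from required_classes import *` before calling `pickle.load`.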
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ numpy==1.22.4
+ torch==2.0.1
+ scikit-learn==1.2.2
+ transformers==4.29.2
+ gradio==3.11