SatAT committed
Commit 1a0448c · Parent: fb4531a

Upload 3 files

Files changed (3)
  1. app.py +109 -0
  2. model_last_version.pt +3 -0
  3. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,109 @@
+ import numpy as np
+ import torch
+ import streamlit as st
+ from transformers import BertTokenizer
+ from transformers import BertForSequenceClassification
+ from sklearn.preprocessing import LabelEncoder
+ from keras.utils import pad_sequences
+ from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
+
+ st.markdown("### Hello, world!")
+ st.markdown("<img width=200px src='https://rozetked.me/images/uploads/dwoilp3BVjlE.jpg'>", unsafe_allow_html=True)
+ # ^-- you can show the user text, images, and a limited subset of HTML -- just like in Jupyter
+
+ text = st.text_area("TEXT HERE")
+ # ^-- display a text area; `text` holds whatever string is in it at the moment
+
+ if torch.cuda.is_available():
+     # Tell PyTorch to use the GPU.
+     device = torch.device("cuda")
+     print('There are %d GPU(s) available.' % torch.cuda.device_count())
+     print('We will use the GPU:', torch.cuda.get_device_name(0))
+ else:
+     print('No GPU available, using the CPU instead.')
+     device = torch.device("cpu")
+
+ # Set the maximum sequence length. 64 is chosen somewhat arbitrarily;
+ # it is slightly larger than the maximum training sentence length of 47.
+ MAX_LEN = 64
+
+ tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+ test_input_ids = []
+ encoded_sent = tokenizer.encode(
+     text,                     # Sentence to encode.
+     add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
+     # encode() also supports truncation and conversion to PyTorch tensors,
+     # but padding is handled separately below, so those features are unused:
+     # max_length=128,       # Truncate all sentences.
+     # return_tensors='pt',  # Return PyTorch tensors.
+ )
+ # Add the encoded sentence to the list, then pad to MAX_LEN.
+ test_input_ids.append(encoded_sent)
+ test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN,
+                                dtype="long", truncating="post", padding="post")
+
+ # Create attention masks: 1 for each real token, 0 for each padding token.
+ attention_masks = []
+ for seq in test_input_ids:
+     seq_mask = [float(i > 0) for i in seq]
+     attention_masks.append(seq_mask)
+
+ # Convert to tensors. There are no ground-truth labels at inference time,
+ # so the dataset holds only the inputs and their masks.
+ prediction_inputs = torch.tensor(test_input_ids)
+ prediction_masks = torch.tensor(attention_masks)
+ prediction_data = TensorDataset(prediction_inputs, prediction_masks)
+ prediction_sampler = SequentialSampler(prediction_data)
+ prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=1)
+
+ model = BertForSequenceClassification.from_pretrained(
+     "bert-base-uncased",         # The 12-layer BERT model with an uncased vocab.
+     num_labels=44,               # The number of output classes for this task.
+     output_attentions=False,     # Whether the model returns attention weights.
+     output_hidden_states=False,  # Whether the model returns all hidden states.
+ )
+ model.load_state_dict(torch.load("model_last_version.pt", map_location=device))
+ model.to(device)
+ # Put the model in evaluation mode.
+ model.eval()
76
+
77
+ # Tracking variables
78
+ predictions, true_labels = [], []
79
+
80
+ # Predict
81
+ for batch in prediction_dataloader:
82
+ # Add batch to GPU
83
+ batch = tuple(t.to(device) for t in batch)
84
+
85
+ # Unpack the inputs from our dataloader
86
+ b_input_ids, b_input_mask, b_labels = batch
87
+
88
+ # Telling the model not to compute or store gradients, saving memory and
89
+ # speeding up prediction
90
+ with torch.no_grad():
91
+ # Forward pass, calculate logit predictions
92
+ outputs = model(b_input_ids, token_type_ids=None,
93
+ attention_mask=b_input_mask)
94
+
95
+ logits = outputs[0]
96
+
97
+ # Move logits and labels to CPU
98
+ logits = logits.detach().cpu().numpy()
99
+ label_ids = b_labels.to('cpu').numpy()
100
+
101
+ # Store predictions and true labels
102
+ predictions.append(logits)
103
+ true_labels.append(label_ids)
104
+
+
+ flat_predictions = [item for sublist in predictions for item in sublist]
+ flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
+
+ # Create a label encoder. NOTE: inverse_transform only works once the
+ # encoder has been fitted with the original training labels (see the
+ # note below this diff).
+ le = LabelEncoder()
+
+ # from transformers import pipeline
+ # pipe = pipeline("ner", "Davlan/distilbert-base-multilingual-cased-ner-hrl")
+ raw_predictions = le.inverse_transform(flat_predictions)  # or: pipe(text)
+ # ^-- the familiar huggingface.transformers code; it can be swapped for
+ # anything from fairseq to catboost
+
+ st.markdown(f"{raw_predictions}")
+ # Show the model's results back to the user.
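Note on the LabelEncoder: as committed, `le` is never fitted, so `le.inverse_transform(flat_predictions)` raises NotFittedError at runtime. A minimal sketch of one way to fix this, not part of the commit: save the fitted encoder's classes at training time and restore them in app.py. The file name "label_classes.npy" is an assumption.

import numpy as np
from sklearn.preprocessing import LabelEncoder

# At training time, after fitting on the 44 training labels:
# le = LabelEncoder().fit(train_labels)
# np.save("label_classes.npy", le.classes_)

# In app.py, before calling inverse_transform:
le = LabelEncoder()
le.classes_ = np.load("label_classes.npy", allow_pickle=True)  # hypothetical artifact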
model_last_version.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:848192683e94e8d65f6c556d5177ef557541453cab886762bef55363a94bedbf
+ size 438152113
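This is a Git LFS pointer file: the repository stores only the spec version, the sha256 object id, and the size (~438 MB), not the weights themselves. A plain `git clone` without LFS fetches just this pointer, so `torch.load("model_last_version.pt")` would fail. A sketch of fetching the real file programmatically with the huggingface_hub library; the repo id here is hypothetical, since the commit page does not show it:

from huggingface_hub import hf_hub_download

ckpt_path = hf_hub_download(
    repo_id="SatAT/your-space-name",   # hypothetical repo id
    filename="model_last_version.pt",
    repo_type="space",
)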
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ torch
+ numpy
+ transformers
+ streamlit
+ scikit-learn
+ keras
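The original file listed only torch, numpy, and transformers, but app.py also imports streamlit, sklearn, and keras, so those packages are added above. The keras dependency exists only for pad_sequences; a minimal numpy replacement (a sketch, not part of the commit) would drop it entirely, matching pad_sequences(..., maxlen=MAX_LEN, dtype="long", truncating="post", padding="post") for this single-sequence case:

import numpy as np

def pad_post(seq, maxlen):
    seq = seq[:maxlen]                     # truncate on the right
    out = np.zeros(maxlen, dtype="int64")  # 0 is BERT's [PAD] token id
    out[:len(seq)] = seq                   # copy tokens, leave zeros as padding
    return out

With the requirements installed (pip install -r requirements.txt), the app starts with: streamlit run app.py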