import numpy as np
import torch
import streamlit as st
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from keras.utils import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

st.markdown("### Hello, world!")
st.markdown("", unsafe_allow_html=True)
# ^-- you can show the user text, images, and a limited subset of HTML --
#     everything works much like in Jupyter

text = st.text_area("TEXT HERE")
# ^-- render a text area; `text` holds whatever string is currently in the field

# Tell PyTorch to use the GPU if one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Set the maximum sequence length.
# 64 is chosen somewhat arbitrarily: it is slightly larger than the
# maximum training sentence length of 47 tokens.
MAX_LEN = 64

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

test_input_ids = []
encoded_sent = tokenizer.encode(
    text,                      # Sentence to encode.
    add_special_tokens=True,   # Add '[CLS]' and '[SEP]'.
    # encode() also supports truncation and conversion to PyTorch tensors,
    # but padding is handled separately below, so those options stay disabled:
    # max_length=128,          # Truncate all sentences.
    # return_tensors='pt',     # Return PyTorch tensors.
)

# Add the encoded sentence to the list.
test_input_ids.append(encoded_sent)

# Pad (or truncate) every sequence to MAX_LEN tokens.
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long",
                               truncating="post", padding="post")

# Create attention masks: 1 for each real token, 0 for each padding token.
attention_masks = []
for seq in test_input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)

# Convert to tensors.
prediction_inputs = torch.tensor(test_input_ids)
prediction_masks = torch.tensor(attention_masks)

# There are no ground-truth labels at inference time, so the dataset holds
# only the input ids and the attention masks.
prediction_data = TensorDataset(prediction_inputs, prediction_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=1)

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",          # The 12-layer BERT model with an uncased vocab.
    num_labels=44,                # Number of output labels (44 classes in this task).
    output_attentions=False,      # Whether the model returns attention weights.
    output_hidden_states=False,   # Whether the model returns all hidden states.
)

# Load the fine-tuned weights and put the model in evaluation mode.
# map_location=device lets the checkpoint load on CPU-only machines as well.
model.load_state_dict(torch.load("model_last_version.pt", map_location=device))
model.to(device)
model.eval()

# Tracking variable
predictions = []

# Predict
for batch in prediction_dataloader:
    # Add batch to the device.
    batch = tuple(t.to(device) for t in batch)

    # Unpack the inputs from our dataloader.
    b_input_ids, b_input_mask = batch

    # Telling the model not to compute or store gradients, saving memory and
    # speeding up prediction.
    with torch.no_grad():
        # Forward pass, calculate logit predictions.
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    logits = outputs[0]

    # Move logits to the CPU and store the predictions.
    logits = logits.detach().cpu().numpy()
    predictions.append(logits)

flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

# Creating an instance of LabelEncoder.
# NOTE: inverse_transform only works on a *fitted* encoder, so the class names
# used during training must be restored here. "label_classes.npy" is an assumed
# file name: save le.classes_ alongside the model checkpoint during training.
le = LabelEncoder()
le.classes_ = np.load("label_classes.npy", allow_pickle=True)

# print("Predict: ", le.inverse_transform(flat_predictions))
# from transformers import pipeline
# pipe = pipeline("ner", "Davlan/distilbert-base-multilingual-cased-ner-hrl")

raw_predictions = le.inverse_transform(flat_predictions)  # pipe(text)
# ^-- the familiar huggingface.transformers code; it could be replaced with
#     anything from fairseq to CatBoost

st.markdown(f"{raw_predictions}")
# ^-- show the model's output back to the user
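
# --- Notes (assumptions, not part of the original script) -------------------
# The label-class file loaded above has to be produced on the training side;
# a minimal sketch, assuming the training labels live in `train_labels` and
# the chosen file name is "label_classes.npy":
#
#     le = LabelEncoder()
#     y = le.fit_transform(train_labels)
#     np.save("label_classes.npy", le.classes_)
#
# Assuming this file is saved as app.py, the app can be served locally with:
#
#     streamlit run app.py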