Spaces:

SatAT
/

transformer_service

Sleeping

App Files Files Community

transformer_service / app.py

SatAT

Update app.py

a348a05 over 2 years ago

raw

history blame

3.72 kB

	import numpy as np
	import torch
	import streamlit as st
	from transformers import BertTokenizer
	from transformers import BertForSequenceClassification
	from sklearn.preprocessing import LabelEncoder
	from keras.utils import pad_sequences
	from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

	# Page header. Streamlit can show the user text, images and a limited
	# subset of HTML -- much like a Jupyter notebook.
	st.markdown("### Hello, world!")
	banner_html = "<img width=200px src='https://rozetked.me/images/uploads/dwoilp3BVjlE.jpg'>"
	st.markdown(banner_html, unsafe_allow_html=True)

	# Text box for the user's input; `text` holds the string that is in the
	# box at the moment the script runs.
	text = st.text_area("TEXT HERE")

	# Select the compute device: a CUDA GPU when PyTorch can see one,
	# otherwise the CPU. The choice is reported on stdout.
	# NOTE(review): in the code visible here `device` is only reported; the
	# `.to(device)` calls are commented out, so inference runs on the CPU.
	if torch.cuda.is_available():
	    # Tell PyTorch to use the GPU.
	    device = torch.device("cuda")
	    print(f"There are {torch.cuda.device_count()} GPU(s) available.")
	    print("We will use the GPU:", torch.cuda.get_device_name(0))
	else:
	    # No GPU: fall back to the CPU.
	    print("No GPU available, using the CPU instead.")
	    device = torch.device("cpu")
	# Maximum sequence length fed to the model, counting '[CLS]' and '[SEP]'.
	# 64 was chosen somewhat arbitrarily: it is slightly larger than the
	# maximum training sentence length of 47.
	MAX_LEN = 64

	# Nothing to classify until the user has typed something; an empty input
	# would otherwise reach the model as an empty tensor.
	if not str(text).strip():
	    st.stop()

	tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

	# Encode the input WITH the special tokens (the classification head reads
	# the '[CLS]' position) and truncate to MAX_LEN so that long inputs cannot
	# exceed the length the model accepts.
	encoded_sent = tokenizer.encode(
	    str(text),                 # Sentence to encode.
	    add_special_tokens=True,   # Add '[CLS]' and '[SEP]'.
	    max_length=MAX_LEN,        # Upper bound on the number of token ids.
	    truncation=True,           # Drop tokens beyond max_length.
	)
	indexed_tokens = encoded_sent

	# Despite its name, `segments_ids` is intended as the attention mask for
	# this single, unpadded sentence: every position is a real token, so every
	# entry is 1. (A mask of zeros would hide the whole input from the model.)
	segments_ids = [1] * len(indexed_tokens)

	# Add a leading batch dimension of size 1.
	tokens_tensor = torch.tensor([indexed_tokens])
	segments_tensors = torch.tensor([segments_ids])

	# Skip inference when there are no token ids at all (e.g. the text box is
	# still empty); the model cannot embed an empty tensor.
	if tokens_tensor.numel() == 0:
	    st.stop()

	# Rebuild the classifier architecture, then load the fine-tuned weights
	# from the local checkpoint onto the CPU.
	model = BertForSequenceClassification.from_pretrained(
	    "bert-base-uncased",          # 12-layer BERT model with an uncased vocab.
	    num_labels=44,                # Size of the classification head.
	    output_attentions=False,      # Do not return attention weights.
	    output_hidden_states=False,   # Do not return all hidden states.
	)
	model.load_state_dict(torch.load("model_last_version.pt", map_location=torch.device('cpu')))
	# model.to(device)
	model.eval()

	# Inference only: no gradients are needed.
	with torch.no_grad():
	    logit = model(
	        tokens_tensor,
	        token_type_ids=None,
	        # The input is a single unpadded sentence, so every token is
	        # attended to.
	        attention_mask=torch.ones_like(tokens_tensor),
	    )

	# Sequence-classification logits are 2-D (batch, num_labels); take the
	# arg-max over the label axis. (Indexing dimension 2 raises IndexError.)
	logit_new = logit[0].argmax(dim=-1).detach().cpu().numpy().tolist()

	# One class id per input row, kept as a 1-D list because the label
	# decoding further down expects an array-like, not a scalar.
	prediction = logit_new

	# Map the predicted class ids back to label names and show them.
	# NOTE(review): this LabelEncoder is created fresh and never fitted in this
	# script, so it has no `classes_` and `inverse_transform` would raise
	# NotFittedError.
	# TODO: restore the classes fitted at training time (set `le.classes_`).
	# Until then, fall back to showing the numeric class ids instead of
	# crashing.
	le = LabelEncoder()

	# Accept either a single class id or a sequence of them.
	class_ids = np.atleast_1d(prediction)
	if hasattr(le, "classes_"):
	    raw_predictions = le.inverse_transform(class_ids)
	else:
	    raw_predictions = class_ids

	# Display the model's result to the user.
	st.markdown(f"{raw_predictions}")