Spaces:

SatAT
/

transformer_service

Sleeping

App Files Files Community

transformer_service / app.py

SatAT

Update app.py

b98589b about 2 years ago

raw

history blame

4.45 kB

	import numpy as np
	import torch
	import streamlit as st
	from transformers import BertTokenizer
	from transformers import BertForSequenceClassification
	from sklearn.preprocessing import LabelEncoder
	from keras.utils import pad_sequences
	from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

	# Page header. Streamlit can show the user text, images and a restricted
	# subset of HTML, much like a Jupyter notebook.
	PAGE_HEADING = "### Paper category classification"
	BANNER_HTML = "<img width=200px src='https://grandgames.net/img/upload/0d153888a24eb5b8c0195495cd83d0dd.jpg'>"

	st.markdown(PAGE_HEADING)
	st.markdown(BANNER_HTML, unsafe_allow_html=True)
	# Pick the caching decorator for the model loader.
	# The legacy ``st.cache`` is deprecated and, with default settings, hashes the
	# returned objects to detect mutation, which is expensive for a BERT model.
	# Prefer ``st.cache_resource`` when this Streamlit version provides it and
	# otherwise fall back to ``st.cache`` with the mutation check disabled.
	_cache_model = getattr(st, "cache_resource", None) or st.cache(allow_output_mutation=True)


	@_cache_model
	def load_model_and_tokenizer():
	    """Load the fine-tuned BERT classifier and its tokenizer.

	    The base ``bert-base-uncased`` architecture is instantiated with a
	    44-way classification head and then populated with the weights stored
	    in ``model_last_version.pt`` (mapped onto the CPU).

	    Returns:
	        tuple: ``(model, tokenizer)`` -- the classifier in evaluation mode
	        and the matching ``BertTokenizer``.
	    """
	    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
	    model = BertForSequenceClassification.from_pretrained(
	        "bert-base-uncased",  # the 12-layer BERT model with an uncased vocab
	        num_labels=44,
	    )
	    # NOTE(review): torch.load unpickles the file, so the checkpoint must come
	    # from a trusted source.
	    state_dict = torch.load("model_last_version.pt", map_location=torch.device('cpu'))
	    model.load_state_dict(state_dict)
	    # The app only runs inference; make sure dropout is disabled.
	    model.eval()
	    return model, tokenizer

	# Human-readable names of the classifier outputs: output index i maps to
	# tags_names[i]. The list has 44 entries, one per model output (the model is
	# loaded with num_labels = 44). Several names appear twice; do not remove or
	# reorder entries, because that would shift the index -> name mapping.
	# NOTE(review): presumably the repeated names mirror the label set used for
	# training -- confirm against the training code.
	tags_names = [
	    'Accelerator Physics',
	    'adap-org',
	    "adap-org",
	    'Algebra-Geometry',
	    'Astro-physics',
	    "Astro-physics",
	    'Chao-dynamics',
	    'Chemistry-physics',
	    'cmp-lg',
	    "cmp-lg",
	    'comp-gas',
	    'cond-mat',
	    "cond-mat",
	    'Computer Science',
	    'dg-ga',
	    'Economics',
	    'eess',
	    'funct-an',
	    'gr-qc',
	    "gr-qc",
	    'hep-ex',
	    "hep-ex",
	    'hep-lat',
	    "hep-lat",
	    'hep-ph',
	    "hep-ph",
	    'hep-th',
	    "hep-th",
	    'Math',
	    'math-ph',
	    'mtrl-th',
	    'nlin',
	    'nucl-ex',
	    'nucl-th',
	    "nucl-th",
	    'patt-sol',
	    'Physics',
	    'q-alg',
	    'Quantitie-biology',
	    'q-fin',
	    'quant-ph',
	    "quant-ph",
	    'solv-int',
	    'Statistics',
	]


	def _predict_logits(model, tokenizer, text):
	    """Return the classifier logits for a single piece of text."""
	    encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
	    # Inference only: skip building the autograd graph.
	    with torch.no_grad():
	        return model(encoded['input_ids'], encoded['attention_mask'])[0]


	def _select_top_classes(probs, names, cumulative_threshold=0.95, min_prob=0.001):
	    """Pick the most probable classes covering ``cumulative_threshold`` mass.

	    Classes are taken in order of decreasing probability. The class that
	    pushes the running total over the threshold is included, so a single
	    very confident prediction still yields one result instead of an empty
	    list. Classes with probability below ``min_prob`` are never reported.

	    Args:
	        probs: 1-D tensor of class probabilities.
	        names: sequence mapping a class index to its display name.
	        cumulative_threshold: stop once the reported mass exceeds this.
	        min_prob: stop at the first class less probable than this.

	    Returns:
	        list of ``(name, probability)`` tuples, most probable first.
	    """
	    sorted_probs, sorted_indices = torch.sort(probs, descending=True)
	    selected = []
	    cumulative = 0.0
	    for prob, index in zip(sorted_probs.tolist(), sorted_indices.tolist()):
	        if prob < min_prob:
	            break
	        selected.append((names[index], prob))
	        cumulative += prob
	        if cumulative > cumulative_threshold:
	            break
	    return selected


	model, tokenizer = load_model_and_tokenizer()

	# Text areas: each variable holds the string currently typed into its field.
	title = st.text_area("INPUT TITLE HERE")
	abstract = st.text_area("INPUT ABSTRACT HERE")

	if not title and not abstract:
	    st.markdown("Could you input paper title/abstract :)")
	elif not title:
	    # An abstract alone is not enough: the title is required.
	    st.markdown("Could you input paper title :)")
	else:
	    # Score the title, then add the abstract's logits when one was given.
	    logits = _predict_logits(model, tokenizer, title)
	    if abstract:
	        logits = logits + _predict_logits(model, tokenizer, abstract)

	    # Turn the combined logits into probabilities over the categories.
	    probs = torch.softmax(logits, dim=-1).squeeze()
	    top_classes = _select_top_classes(probs, tags_names)

	    # Show the selected categories with their probabilities to the user.
	    st.markdown(f"Possible categories with their probabilities for this paper : {top_classes}")