Spaces:
Sleeping
Sleeping
import numpy as np | |
import torch | |
import streamlit as st | |
from transformers import BertTokenizer | |
from transformers import BertForSequenceClassification | |
from sklearn.preprocessing import LabelEncoder | |
from keras.utils import pad_sequences | |
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler | |
# Page header. Streamlit can render text, images and a limited subset of HTML,
# much like a Jupyter notebook.
banner_html = "<img width=200px src='https://grandgames.net/img/upload/0d153888a24eb5b8c0195495cd83d0dd.jpg'>"
st.markdown("### Paper category classification")
st.markdown(banner_html, unsafe_allow_html=True)
def _resource_cache():
    """Return the Streamlit decorator that keeps heavy objects across reruns.

    Newer Streamlit releases expose ``st.cache_resource``. Older releases only
    provide ``st.cache``; there ``allow_output_mutation=True`` is used so the
    cached model object is not hashed to detect mutation.
    """
    cache_resource = getattr(st, "cache_resource", None)
    if cache_resource is not None:
        return cache_resource
    return st.cache(allow_output_mutation=True)


# Streamlit re-executes the whole script on every widget interaction; without
# caching, the checkpoint would be rebuilt and re-read from disk each time.
@_resource_cache()
def load_model_and_tokenizer():
    """Build the fine-tuned BERT classifier and its tokenizer.

    The architecture and vocabulary come from ``bert-base-uncased`` with a
    44-way classification head; the fine-tuned weights are then read from
    ``model_last_version.pt`` and mapped onto the CPU.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is put in evaluation mode.
    """
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",  # 12-layer BERT model with an uncased vocabulary
        num_labels=44,
    )
    state_dict = torch.load("model_last_version.pt", map_location=torch.device("cpu"))
    model.load_state_dict(state_dict)
    # The app only runs inference, so make evaluation mode explicit.
    model.eval()
    return model, tokenizer
# Class names indexed by the classifier's output position (44 entries).
# NOTE(review): several names appear twice; they are kept as-is because the
# positions must line up with the model's output logits -- confirm against the
# label encoding used during training.
tags_names = ['Accelerator Physics',
              'adap-org',
              "adap-org",
              'Algebra-Geometry',
              'Astro-physics',
              "Astro-physics",
              'Chao-dynamics',
              'Chemistry-physics',
              'cmp-lg',
              "cmp-lg",
              'comp-gas',
              'cond-mat',
              "cond-mat",
              'Computer Science',
              'dg-ga',
              'Economics',
              'eess',
              'funct-an',
              'gr-qc',
              "gr-qc",
              'hep-ex',
              "hep-ex",
              'hep-lat',
              "hep-lat",
              'hep-ph',
              "hep-ph",
              'hep-th',
              "hep-th",
              'Math',
              'math-ph',
              'mtrl-th',
              'nlin',
              'nucl-ex',
              'nucl-th',
              "nucl-th",
              'patt-sol',
              'Physics',
              'q-alg',
              'Quantitie-biology',
              'q-fin',
              'quant-ph',
              "quant-ph",
              'solv-int',
              'Statistics']


def _predict_logits(text, model, tokenizer):
    """Tokenize ``text`` and return the classifier logits for it.

    Gradients are disabled because the app only performs inference.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        return model(encoded["input_ids"], attention_mask=encoded["attention_mask"])[0]


def _select_top_classes(probs, names, cumulative_threshold=0.95, min_prob=0.001):
    """Pick the most probable classes until their total mass passes a threshold.

    Classes are visited in descending probability. Selection stops before any
    class whose probability is below ``min_prob`` and right after the class
    that pushes the running total above ``cumulative_threshold``. That class
    is included, so a single confident prediction is still reported.

    Args:
        probs: Sequence of per-class probabilities.
        names: Sequence of class names aligned with ``probs``.
        cumulative_threshold: Total probability mass to cover.
        min_prob: Classes less probable than this are never reported.

    Returns:
        A list of ``(name, probability)`` tuples in descending probability.
    """
    ranked = sorted(zip(names, probs), key=lambda pair: pair[1], reverse=True)
    selected = []
    covered = 0.0
    for name, prob in ranked:
        if prob < min_prob:
            break
        selected.append((name, prob))
        covered += prob
        if covered > cumulative_threshold:
            break
    return selected


model, tokenizer = load_model_and_tokenizer()

# Text areas: each variable holds the string currently typed into its field.
title = st.text_area("INPUT TITLE HERE")
abstract = st.text_area("INPUT ABSTRACT HERE")

# Whitespace-only input is treated as empty.
has_title = bool(title.strip())
has_abstract = bool(abstract.strip())

if not has_title and not has_abstract:
    st.markdown("Could you input paper title/abstract :)")
elif not has_title:
    st.markdown("Could you input paper title :)")
else:
    # Score the title, and add the abstract's logits when an abstract is given.
    logits = _predict_logits(title, model, tokenizer)
    if has_abstract:
        logits = logits + _predict_logits(abstract, model, tokenizer)

    # Convert logits to probabilities and keep the most probable classes.
    probs = torch.softmax(logits, dim=-1).squeeze().tolist()
    top_classes = _select_top_classes(probs, tags_names)

    raw_predictions = top_classes
    # Show the selected categories and their probabilities to the user.
    st.markdown(f"Possible categories with their probabilities for this paper : {raw_predictions}")