Spaces:
Runtime error
Runtime error
import streamlit as st | |
from torch.utils.data import Dataset, DataLoader | |
import torch | |
from sklearn.model_selection import train_test_split | |
from transformers import get_linear_schedule_with_warmup, AdamW | |
from torch.cuda.amp import autocast, GradScaler | |
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, \ | |
BigBirdPegasusForSequenceClassification, BigBirdTokenizer | |
from transformers import pipeline | |
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler | |
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score | |
import streamlit as st | |
import pandas as pd | |
import json | |
import ast | |
from scipy import stats | |
import numpy as np | |
import time | |
import datetime | |
# | |
def get_top95(y_predict, convert_target, threshold=0.95):
    """Return the most probable labels that together exceed *threshold*.

    Classes are visited from the most to the least probable. Each visited
    class contributes its label to the result, and the walk stops as soon as
    the running sum of probabilities is strictly greater than ``threshold``.

    Args:
        y_predict: Sequence of per-class probabilities, indexed by class id.
        convert_target: Mapping from the class id rendered as a string
            (e.g. ``"3"``) to the label to report.
        threshold: Cumulative probability that has to be exceeded before the
            walk stops. Defaults to 0.95.

    Returns:
        Labels ordered by decreasing probability. The list is empty when
        ``y_predict`` is empty, and contains every label when the threshold
        is never exceeded.

    Raises:
        KeyError: If a visited class id has no entry in ``convert_target``.
    """
    # sorted() is stable, so classes with equal probability keep index order.
    ranked = sorted(enumerate(y_predict), key=lambda pair: pair[1], reverse=True)
    labels = []
    cumulative = 0
    for class_idx, prob in ranked:
        cumulative += prob
        labels.append(convert_target[str(class_idx)])
        if cumulative > threshold:
            break
    return labels
# | |
# Custom model: DistilBERT followed by a dense layer, dropout, and a final linear classifier that produces the model output.
from transformers import DistilBertModel, DistilBertTokenizer | |
class DistillBERTClass(torch.nn.Module):
    """DistilBERT encoder topped with a small classification head.

    The head takes the hidden state at the first sequence position, feeds it
    through a 768 -> 768 linear layer, ReLU and dropout (p=0.3), and finally
    projects it to 8 output scores.
    """

    def __init__(self):
        super().__init__()
        # NOTE(review): attribute names and their registration order are kept
        # exactly as before; presumably the pickled checkpoint loaded later in
        # this file depends on them -- confirm before renaming.
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 8)

    def forward(self, input_ids, attention_mask):
        """Return the classifier scores for the given token ids and mask."""
        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output, sliced at sequence position 0.
        first_token = encoder_out[0][:, 0]
        features = torch.nn.functional.relu(self.pre_classifier(first_token))
        return self.classifier(self.dropout(features))
LEARNING_RATE = 1e-05

# Load the fine-tuned network straight from the checkpoint. The loaded object
# is called directly in get_predict, so the file is expected to hold the whole
# pickled module; building a fresh DistillBERTClass() first only downloaded
# pretrained weights that were thrown away on the next line.
# NOTE(review): torch.load unpickles arbitrary Python objects -- only load
# checkpoints from a trusted source. Newer PyTorch releases default to
# weights_only=True, which rejects full-module pickles; pass
# weights_only=False explicitly if the deployed version requires it -- confirm.
model = torch.load("pytorch_distilbert_news.bin", map_location=torch.device('cpu'))
# Inference only: switch dropout off so predictions are deterministic.
model.eval()

# Kept as a module-level name for backward compatibility; it now refers to the
# parameters of the model that is actually used.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
# model.load_state_dict(checkpoint['model']) | |
# optimizer.load_state_dict(checkpoint['opt']) | |
# model.to("cpu") | |
# print(model) | |
# model = DistilBertForSequenceClassification.from_pretrained("model/distilbert-model1.pt", local_files_only=True) | |
# tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-pegasus-large-arxiv') | |
# model = BigBirdPegasusForSequenceClassification.from_pretrained('google/bigbird-pegasus-large-arxiv', | |
# num_labels=8, | |
# return_dict=False) | |
def get_predict(text):
    """Classify *text* and return its most probable labels.

    The text is tokenized, passed through the module-level ``model``, and the
    resulting scores are turned into probabilities with a softmax. The labels
    are then selected by ``get_top95`` using the id -> label mapping stored in
    ``decode_target (1).json``.

    Args:
        text: Raw input string to classify.

    Returns:
        The list of labels returned by ``get_top95``.
    """
    # NOTE(review): the tokenizer is the *cased* checkpoint while the encoder
    # in DistillBERTClass is 'distilbert-base-uncased'. It is left unchanged
    # because it has to match the tokenizer used when the checkpoint was
    # trained -- confirm against the training code.
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
    # DistilBERT accepts at most 512 positions; without truncation a long
    # input raises an indexing error inside the encoder.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # eval() is idempotent and guarantees dropout is disabled; no_grad() avoids
    # building an autograd graph that inference never uses.
    model.eval()
    with torch.no_grad():
        outputs = model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
        )

    # One input is tokenized per call, so only the first row is used.
    logits = outputs[0]
    # An explicit dim avoids the deprecated implicit-dimension softmax.
    y_predict = torch.nn.functional.softmax(logits, dim=-1).cpu().numpy()

    file_path = "decode_target (1).json"
    with open(file_path, 'r', encoding="utf-8") as json_file:
        decode_target = json.load(json_file)
    return get_top95(y_predict, decode_target)
# | |
# | |
# | |
# | |
# | |
# get_predict('''physics physics physics physics physics | |
# physics physics physics physics''') | |
# | |
st.markdown("### Hello, world!")
st.markdown("<img width=200px src='https://rozetked.me/images/uploads/dwoilp3BVjlE.jpg'>", unsafe_allow_html=True)
# ^-- you can show the user text, images, and a limited subset of HTML -- just like in Jupyter
text = st.text_area("TEXT HERE")
# ^-- show a text area; `text` holds the string that is currently in it
# from transformers import pipeline
# pipe = pipeline("ner", "Davlan/distilbert-base-multilingual-cased-ner-hrl")
# raw_predictions = pipe(text)
# the familiar huggingface.transformers code goes here -- it can be swapped for anything from fairseq to catboost
# Skip inference until the user has typed something: classifying a blank
# string only produces a meaningless prediction and costs a full model run
# on every rerun of the script.
if text and text.strip():
    st.markdown(f"It's prediction: {get_predict(text)}")