Spaces:
Runtime error
Runtime error
File size: 4,877 Bytes
9d659d5 0fac54c a68b645 0fac54c a331b68 b6cef83 2ad63d4 0fac54c b58d194 5e992d7 0fac54c 5e992d7 0fac54c 7895958 0fac54c 2ab0be5 0fac54c d470d7b 0fac54c 9d659d5 30c67b9 af9fdfa 5e992d7 30c67b9 5e992d7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
import streamlit as st
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.model_selection import train_test_split
from transformers import get_linear_schedule_with_warmup, AdamW
from torch.cuda.amp import autocast, GradScaler
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, \
BigBirdPegasusForSequenceClassification, BigBirdTokenizer
from transformers import pipeline
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import streamlit as st
import pandas as pd
import json
import ast
from scipy import stats
import numpy as np
import time
import datetime
#
def get_top95(y_predict, convert_target, threshold=0.95):
    """Return the most probable labels covering more than ``threshold`` mass.

    Classes are visited from most to least probable. The label of every
    visited class is collected, and the walk stops as soon as the running
    sum of probabilities becomes strictly greater than ``threshold``.

    Args:
        y_predict: Iterable of per-class probabilities; the position of a
            value is its class index.
        convert_target: Mapping from the class index *as a string*
            (``"0"``, ``"1"``, ...) to the label to report.
        threshold: Cumulative probability at which to stop. Defaults to
            0.95, the value that used to be hard-coded.

    Returns:
        List of labels ordered by decreasing probability. If the total
        probability never exceeds ``threshold`` every label is returned;
        an empty ``y_predict`` yields an empty list.

    Raises:
        KeyError: If a visited class index has no entry in
            ``convert_target``.
    """
    # sorted() is stable, so equally probable classes keep index order.
    ranked = sorted(enumerate(y_predict), key=lambda pair: pair[1], reverse=True)

    labels = []
    cumulative = 0
    for class_idx, prob in ranked:
        cumulative += prob
        labels.append(convert_target[str(class_idx)])
        if cumulative > threshold:
            break
    return labels
#
# Create the customized model by adding a dropout and a dense layer on top of DistilBERT to produce the final output.
from transformers import DistilBertModel, DistilBertTokenizer
# model.load_state_dict(checkpoint['model'])
# optimizer.load_state_dict(checkpoint['opt'])
# model.to("cpu")
# print(model)
# model = DistilBertForSequenceClassification.from_pretrained("model/distilbert-model1.pt", local_files_only=True)
# tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-pegasus-large-arxiv')
# model = BigBirdPegasusForSequenceClassification.from_pretrained('google/bigbird-pegasus-large-arxiv',
# num_labels=8,
# return_dict=False)
class DistillBERTClass(torch.nn.Module):
    """DistilBERT encoder with a small feed-forward classification head.

    The head is: Linear(768 -> 768), ReLU, Dropout(0.3), Linear(768 -> 8).
    ``forward`` returns the raw output of the last linear layer; no softmax
    is applied here.
    """

    def __init__(self):
        super().__init__()
        # NOTE: attribute names (l1, pre_classifier, dropout, classifier) are
        # part of any saved checkpoint's layout - presumably the weights file
        # loaded later in this script relies on them, so do not rename.
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 8)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and the head; return one row of 8 scores per input row.

        Args:
            input_ids: Token id tensor passed straight to the encoder.
            attention_mask: Attention mask tensor passed straight to the encoder.
        """
        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output, sliced at position 0 of the
        # second axis (presumably the [CLS] token's hidden state).
        first_token = encoder_out[0][:, 0]
        activated = torch.nn.functional.relu(self.pre_classifier(first_token))
        return self.classifier(self.dropout(activated))
# Build the architecture first: it is the target for a state-dict checkpoint,
# and it also provides the parameters the optimizer below is created from.
model = DistillBERTClass()
LEARNING_RATE = 1e-05
# NOTE(review): the optimizer is not used by the inference code visible in
# this file; it is kept so the module-level name stays available.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# SECURITY: torch.load unpickles the file, which can execute arbitrary code.
# Only ever point this at a checkpoint you produced or fully trust.
_checkpoint = torch.load("pytorch_distilbert_news (3).bin", map_location=torch.device('cpu'))
if isinstance(_checkpoint, torch.nn.Module):
    # The file holds a whole pickled module: use it directly (original behavior).
    model = _checkpoint
else:
    # The file holds only weights: load them into the freshly built model
    # instead of leaving `model` bound to a non-callable dict.
    model.load_state_dict(_checkpoint)

# Switch Dropout to inference behavior; in training mode the 0.3 dropout in
# the head would randomly perturb every prediction.
model.eval()
def get_predict(title, abstract):
    """Classify a paper from its title and abstract.

    The two texts are tokenized as a sentence pair, passed through the
    module-level ``model``, turned into probabilities with softmax and
    reduced to the top labels by ``get_top95``.

    Args:
        title: Paper title text.
        abstract: Paper abstract text.

    Returns:
        The list of labels returned by ``get_top95`` for the first (only)
        row of the model output, using the index-to-label mapping stored in
        ``sample.json``.

    Raises:
        FileNotFoundError: If ``sample.json`` is missing.
    """
    # NOTE(review): the tokenizer checkpoint is "cased" while DistillBERTClass
    # initialises its encoder from "distilbert-base-uncased". Left unchanged
    # because it must match whatever was used at training time - confirm.
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

    # Truncate to 512 tokens: longer inputs overflow DistilBERT's position
    # embeddings and crash the forward pass.
    inputs = tokenizer(
        title,
        abstract,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    # Inference only: disable dropout and skip building the autograd graph.
    model.eval()
    with torch.no_grad():
        outputs = model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
        )

    # Single input row, so take row 0 of the model output.
    logits = outputs[0]
    # Explicit dim: softmax without it relies on a deprecated implicit choice.
    y_predict = torch.nn.functional.softmax(logits, dim=-1).cpu().detach().numpy()

    file_path = "sample.json"
    with open(file_path, 'r', encoding="utf-8") as json_file:
        decode_target = json.load(json_file)
    return get_top95(y_predict, decode_target)
#
#
#
#
#
# get_predict('''physics physics physics physics physics
# physics physics physics physics''')
#
st.markdown("### Hello, world!")
st.markdown("<img width=200px src='https://rozetked.me/images/uploads/dwoilp3BVjlE.jpg'>", unsafe_allow_html=True)
# ^-- you can show the user text, images and a limited subset of HTML - just like in Jupyter

# Distinct labels so the user can tell which field is which (both used to
# read "TEXT HERE"); the widget keys are unchanged.
title = st.text_area("Title", key=1)
abstract = st.text_area("Abstract", key=2)
# ^-- show the text fields; each variable holds the string currently entered in its field
# from transformers import pipeline
# pipe = pipeline("ner", "Davlan/distilbert-base-multilingual-cased-ner-hrl")
# raw_predictions = pipe(text)
# the familiar huggingface.transformers code goes here -- it can be replaced with anything from fairseq to catboost

# Skip inference until the user has typed something: Streamlit reruns this
# script on every interaction, including the first render with empty fields.
if title.strip() or abstract.strip():
    st.markdown(f"It's prediction: {get_predict(title, abstract)}")