import json

import streamlit as st
import torch
from transformers import DistilBertModel, DistilBertTokenizer

def get_top95(y_predict, convert_target):
    """Return labels in descending-probability order until their cumulative
    probability exceeds 0.95."""
    lst_labels = []
    sort_y = sorted(enumerate(y_predict), key=lambda x: x[1], reverse=True)
    cumsum = 0.0
    for key, prob in sort_y:
        cumsum += prob
        lst_labels.append(convert_target[str(key)])
        if cumsum > 0.95:
            break
    return lst_labels
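
# Quick sanity check (toy probabilities with a hypothetical decode map):
# get_top95([0.6, 0.25, 0.08, 0.03, 0.01, 0.01, 0.01, 0.01],
#           {str(i): f"class_{i}" for i in range(8)})
# -> ['class_0', 'class_1', 'class_2', 'class_3']   (0.60 + 0.25 + 0.08 + 0.03 = 0.96 > 0.95)
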
# Customized model: a dropout and a dense layer on top of DistilBERT produce
# the final 8-class output.

class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 8)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]           # (batch, seq_len, 768)
        pooler = hidden_state[:, 0]          # embedding of the [CLS] token
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)     # (batch, 8) logits
        return output

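# Shape sanity check (a sketch with dummy token ids, not real tokenized text):
# ids = torch.randint(0, 30522, (2, 16))    # batch of 2, sequence length 16
# mask = torch.ones_like(ids)
# DistillBERTClass()(ids, mask).shape       # -> torch.Size([2, 8])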

model = DistillBERTClass()
LEARNING_RATE = 1e-05

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)  # unused at inference time
# The checkpoint is a fully pickled module, so torch.load replaces the freshly
# built instance above.
model = torch.load("pytorch_distilbert_news.bin", map_location=torch.device('cpu'))
model.eval()  # disable dropout for inference

# Earlier experiments, kept for reference:
# model = DistilBertForSequenceClassification.from_pretrained("model/distilbert-model1.pt", local_files_only=True)
# tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-pegasus-large-arxiv')
# model = BigBirdPegasusForSequenceClassification.from_pretrained('google/bigbird-pegasus-large-arxiv',
#     num_labels=8,
#     return_dict=False)
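
# A more portable save/load pattern (a sketch; assumes the checkpoint is
# re-saved as a dict of state_dicts rather than a pickled module):
# torch.save({'model': model.state_dict(), 'opt': optimizer.state_dict()}, 'checkpoint.pt')
# checkpoint = torch.load('checkpoint.pt', map_location='cpu')
# model.load_state_dict(checkpoint['model'])
# optimizer.load_state_dict(checkpoint['opt'])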

def get_predict(text):
    # The tokenizer must match the model's pretraining checkpoint (uncased,
    # see DistillBERTClass).
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    inputs = tokenizer(
        text,
        truncation=True,      # DistilBERT accepts at most 512 tokens
        max_length=512,
        return_tensors="pt",
    )
    with torch.no_grad():     # inference only, no gradients needed
        logits = model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
        )
    # logits has shape (1, 8): take the single example and turn logits into probabilities
    y_predict = torch.nn.functional.softmax(logits[0], dim=0).numpy()

    file_path = "decode_target (1).json"
    with open(file_path, 'r') as json_file:
        decode_target = json.load(json_file)
    return get_top95(y_predict, decode_target)
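
# decode_target is assumed to map stringified class indices to label names,
# e.g. {"0": "physics", ...} (hypothetical entry; the real mapping lives in the JSON file).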
# Example:
# get_predict('''physics physics physics physics physics
#                physics physics physics physics''')

st.markdown("### Hello, world!")
st.markdown("<img width=200px src='https://rozetked.me/images/uploads/dwoilp3BVjlE.jpg'>", unsafe_allow_html=True)
# ^-- you can show the user text, images, and a limited subset of HTML - just like in Jupyter

text = st.text_area("TEXT HERE")
# ^-- render a text area; `text` holds the string currently in the field

# from transformers import pipeline
# pipe = pipeline("ner", "Davlan/distilbert-base-multilingual-cased-ner-hrl")
# raw_predictions = pipe(text)
# ^-- the familiar huggingface.transformers code - it can be swapped for anything from fairseq to catboost

if text:
    st.markdown(f"Its prediction: {get_predict(text)}")
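
# Possible speed-up (a sketch; assumes a Streamlit version that provides st.cache_resource):
# load the model once per process instead of re-running torch.load on every rerun.
# @st.cache_resource
# def load_model():
#     return torch.load("pytorch_distilbert_news.bin", map_location="cpu").eval()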