import json

import streamlit as st
import torch
from transformers import DistilBertModel, DistilBertTokenizer

#
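# Sort class probabilities in descending order and collect labels until their
# cumulative probability exceeds 0.95.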
def get_top95(y_predict, convert_target):
    lst_labels = []
    sorted_probs = sorted(enumerate(y_predict), key=lambda x: x[1], reverse=True)
    cumsum = 0.0
    for key, prob in sorted_probs:
        cumsum += prob
        lst_labels.append(convert_target[str(key)])
        if cumsum > 0.95:
            break
    return lst_labels
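
# Toy example (hypothetical labels):
#   get_top95([0.7, 0.2, 0.06, 0.04], {"0": "A", "1": "B", "2": "C", "3": "D"}) -> ["A", "B", "C"]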
#
# The customized model: a dropout and a dense layer on top of DistilBERT produce the final output.

# model.load_state_dict(checkpoint['model'])
# optimizer.load_state_dict(checkpoint['opt']) 
# model.to("cpu")

# print(model)
# model = DistilBertForSequenceClassification.from_pretrained("model/distilbert-model1.pt", local_files_only=True)
# tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-pegasus-large-arxiv')

# model = BigBirdPegasusForSequenceClassification.from_pretrained('google/bigbird-pegasus-large-arxiv',
#     num_labels=8,
#     return_dict=False)



class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 8)  # 8 target classes

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]       # (batch, seq_len, 768)
        pooler = hidden_state[:, 0]      # representation of the [CLS] token
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)  # raw logits, shape (batch, 8)
        return output


# torch.load is expected to return a fully pickled DistillBERTClass instance,
# which is why the class definition above must stay available for unpickling.
model = torch.load("pytorch_distilbert_news (3).bin", map_location=torch.device('cpu'))
model.eval()  # disable dropout for inference


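# Tokenize the title/abstract pair, run the classifier, and return the list of
# labels that together cover at least 95% of the predicted probability mass.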
def get_predict(title, abstract):
    # The tokenizer should match the uncased backbone used in DistillBERTClass.
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    inputs = tokenizer(
        title,
        abstract,
        add_special_tokens=True,  # add '[CLS]' and '[SEP]'
        max_length=512,           # DistilBERT's maximum sequence length
        truncation=True,          # truncate longer inputs
        return_tensors="pt",      # return PyTorch tensors
    )
    with torch.no_grad():
        logits = model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
        )
    y_predict = torch.nn.functional.softmax(logits, dim=1)[0].cpu().numpy()
    file_path = "sample.json"
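    # sample.json is assumed to map stringified class indices to human-readable
    # category names (e.g. {"0": "physics", ...}); the exact labels depend on
    # how the checkpoint was trained.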

    with open(file_path, 'r') as json_file:
        decode_target = json.load(json_file)
    return get_top95(y_predict, decode_target)
#
#
#
#
#
# get_predict('''physics physics physics physics physics
#                physics physics physics physics''')
#

st.markdown("### Hello, world!")
st.markdown("<img width=200px src='https://rozetked.me/images/uploads/dwoilp3BVjlE.jpg'>", unsafe_allow_html=True)
# ^-- you can show the user text, images, and a limited subset of HTML - everything works like in Jupyter

title = st.text_area("Title", key=1)
abstract = st.text_area("Abstract", key=2)

# ^-- show text areas; each variable holds the string the user has currently entered

# from transformers import pipeline
# pipe = pipeline("ner", "Davlan/distilbert-base-multilingual-cased-ner-hrl")
# raw_predictions = pipe(text)
# here is the familiar huggingface.transformers code -- it can be replaced with anything from fairseq to catboost

st.markdown(f"It's prediction: {get_predict(title, abstract)}")