File size: 3,611 Bytes
9d659d5
0fac54c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d659d5
30c67b9
 
 
 
 
 
 
 
 
 
 
 
0fac54c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import streamlit as st
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.model_selection import train_test_split
from transformers import get_linear_schedule_with_warmup, AdamW
from torch.cuda.amp import autocast, GradScaler
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, \
                         BigBirdPegasusForSequenceClassification, BigBirdTokenizer
from transformers import pipeline
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import streamlit as st

import pandas as pd
import json
import ast
from scipy import stats
import numpy as np
import time
import datetime

#
def get_top95(y_predict, convert_target):
    """Return the most probable labels whose cumulative probability exceeds 0.95.

    Only the first row of ``y_predict`` is used. Its entries are ranked from
    highest to lowest value and labels are collected in that order until the
    running total passes 0.95; the label that crosses the threshold is
    included. If the total never exceeds 0.95, every label is returned.

    Args:
        y_predict: Sized, indexable batch of per-class probabilities;
            ``y_predict[0]`` must be iterable over numbers.
        convert_target: Mapping from the class index rendered as a string
            (``"0"``, ``"1"``, ...) to its label.

    Returns:
        A list of labels ordered by descending probability (ties keep their
        index order). Empty when ``y_predict`` has no rows.

    Raises:
        KeyError: If a ranked class index is missing from ``convert_target``.
    """
    # An empty batch has no first row to rank; report "no labels" instead of
    # failing with IndexError.
    if len(y_predict) == 0:
        return []

    # sorted() is stable, so equal probabilities keep ascending index order
    # even with reverse=True.
    ranked = sorted(enumerate(y_predict[0]), key=lambda pair: pair[1], reverse=True)

    lst_labels = []
    cumsum = 0
    for idx, prob in ranked:
        cumsum += prob
        # The label mapping is keyed by the stringified class index.
        lst_labels.append(convert_target[str(idx)])
        if cumsum > 0.95:
            break
    return lst_labels
#
# model = MyModel()
# Load the serialized model object from a local file, mapping its storages to
# the CPU, and call .eval() on it (presumably an nn.Module, so this switches
# it to evaluation mode — TODO confirm the checkpoint holds a full module).
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code; only load checkpoints that come from a trusted source.
# NOTE(review): newer PyTorch releases default torch.load to
# weights_only=True, which may reject a fully pickled model — confirm against
# the pinned torch version.
model = torch.load("distilbert-model1.pt", map_location='cpu').eval()
# print(model)
# Alternative loading paths that were tried earlier, kept for reference:
# model = DistilBertForSequenceClassification.from_pretrained("model/distilbert-model1.pt", local_files_only=True)
# tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-pegasus-large-arxiv')

# model = BigBirdPegasusForSequenceClassification.from_pretrained('google/bigbird-pegasus-large-arxiv',
#     num_labels=8,
#     return_dict=False)

def get_predict(text):
    """Classify ``text`` and return its most probable labels.

    The text is tokenized with the ``distilbert-base-cased`` tokenizer, passed
    through the module-level ``model``, and the resulting class probabilities
    are reduced by ``get_top95`` to the labels that together cover more than
    95% of the probability mass. The label names are read from
    ``decode_target (1).json`` in the working directory.

    Args:
        text: The raw input string to classify.

    Returns:
        A list of label names ordered by descending probability, or an empty
        list when ``text`` is empty or contains only whitespace.

    Raises:
        OSError: If the label mapping file cannot be opened.
        json.JSONDecodeError: If the label mapping file is not valid JSON.
    """
    # Nothing to classify: skip tokenizer/model work for blank input.
    if not text or not text.strip():
        return []

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

    # Truncate to 512 tokens (the limit used by the encoding settings this
    # code was originally written with) so long inputs do not overflow the
    # model's position embeddings.
    inputs = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
        )
    logits = outputs[0]

    # Normalize over the last axis explicitly; relying on softmax's implicit
    # dim is deprecated. Assumes logits are (batch, num_labels) — TODO confirm.
    y_predict = torch.nn.functional.softmax(logits, dim=-1).cpu().numpy()

    file_path = "decode_target (1).json"
    # Explicit encoding so the read does not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as json_file:
        decode_target = json.load(json_file)

    labels = get_top95(y_predict, decode_target)
    # Kept for backward compatibility: the result used to be reported on
    # stdout only. It is now also returned so callers can display it.
    print(labels)
    return labels
#
#
#
#
#
# get_predict('''physics physics physics physics physics
#                physics physics physics physics''')
#

# Streamlit page body: header, picture, input box and the prediction output.
st.markdown("### Hello, world!")
st.markdown("<img width=200px src='https://rozetked.me/images/uploads/dwoilp3BVjlE.jpg'>", unsafe_allow_html=True)
# ^-- text, images and a limited subset of HTML can be shown to the user, just like in Jupyter

text = st.text_area("TEXT HERE")
# ^-- show a text area; `text` holds the string that is currently in it

# from transformers import pipeline
# pipe = pipeline("ner", "Davlan/distilbert-base-multilingual-cased-ner-hrl")
# raw_predictions = pipe(text)
# the familiar huggingface.transformers code -- it can be swapped for anything from fairseq to catboost

# The f prefix is required: without it the braces are rendered literally and
# get_predict is never called.
st.markdown(f"It's prediction: {get_predict(text)}")