File size: 2,536 Bytes
bc92274
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b22c372
bc92274
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import streamlit as st
from annotated_text import annotated_text

import os
import torch.nn as nn
import torch.nn.functional as F
import torch 
import torch.optim as optim
from transformers import DistilBertModel
from transformers import AutoTokenizer
import lightning.pytorch as pl
class Classifier(pl.LightningModule):
    """Linear classification head over frozen DistilBERT token embeddings.

    The encoder (``get_bert()``) is never trained: its (batch, 512, 768)
    hidden state is flattened, L2-normalised per sample, and fed to a single
    linear layer producing 3 class logits (Leadership / Diversity / Integrity
    per the caller in this file).
    """

    def __init__(self):
        super().__init__()
        # One linear head over the flattened 512 tokens x 768 dims.
        self.ln1 = torch.nn.Linear(512 * 768, 3)
        # self.ln2 = torch.nn.Linear(1000, 3)
        self.criterion = nn.CrossEntropyLoss()
        # Tokenizer is created lazily (and once) in preprocess(); the
        # original rebuilt it from the hub on every call.
        self._tokenizer = None

    def _flatten_normalize(self, last_hidden_state):
        """Flatten a (B, 512, 768) hidden state and L2-normalise each row.

        ``keepdim=True`` makes the per-sample norm (B, 1) broadcast over the
        feature dimension for any batch size; the original keepdim-less form
        produced a (B,) norm that only broadcast correctly when B == 1.
        """
        flat = last_hidden_state.reshape(-1, 512 * 768)
        return flat / torch.linalg.norm(flat, 2, 1, keepdim=True)

    def training_step(self, batch, batch_idx):
        """One optimisation step: frozen-BERT encode, then train the head.

        Expects ``batch`` as (x, y) where x packs token ids in x[:, :512]
        and the attention mask in x[:, 512:].
        """
        x, y = batch
        with torch.no_grad():
            # No gradients flow into the encoder — only ln1 is trained.
            out = get_bert()(input_ids=x[:, :512], attention_mask=x[:, 512:])
            feats = self._flatten_normalize(out.last_hidden_state)
        logits = self.ln1(feats)
        # logits = self.ln2(logits)
        loss = self.criterion(logits, y)
        self.log("my_loss", loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        """Adam over all registered parameters (effectively just ln1)."""
        optimizer = optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

    def preprocess(self, x):
        """Tokenise a list of strings into a fixed 512-token encoding.

        truncation=True / max_length=512 guarantee exactly 512 positions,
        which the 512*768 reshape in forward() depends on; the original
        padded but never truncated, so over-length inputs crashed there.
        """
        if self._tokenizer is None:
            self._tokenizer = AutoTokenizer.from_pretrained(
                "distilbert-base-uncased", use_fast=True
            )
        return self._tokenizer(
            x,
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )

    def forward(self, x):
        """Return class logits for a tokenizer encoding dict (inference only)."""
        with torch.no_grad():
            out = get_bert()(**x)
            feats = self._flatten_normalize(out.last_hidden_state)
            logits = self.ln1(feats)
            # logits = self.ln2(logits)
            return logits

# allow_output_mutation=True: the cached value is a mutable torch model;
# without it st.cache tries to hash the model on every access to detect
# mutation, which is slow and raises for unhashable internals.
@st.cache(allow_output_mutation=True)
def get_bert():
    """Return the (cached, pretrained, frozen-by-convention) DistilBERT encoder."""
    return DistilBertModel.from_pretrained("distilbert-base-uncased")

# allow_output_mutation=True for the same reason as the encoder cache: the
# returned LightningModule is mutable and should not be re-hashed per access.
@st.cache(allow_output_mutation=True)
def get_classifier():
    """Download (at most once) and load the fine-tuned classifier checkpoint.

    NOTE(review): assumes the gdown share saves as 'model_params' in the
    current working directory — confirm against the Drive file's name.
    """
    ckpt = os.path.join(os.getcwd(), "model_params")
    # Only shell out to Google Drive when the checkpoint is missing; the
    # original re-downloaded on every cache miss (e.g. each fresh process).
    if not os.path.exists(ckpt):
        os.system('gdown 1GxhHvg3lwlGpA7So06v3l43U8pSASy9L')
    return Classifier.load_from_checkpoint(ckpt)

def get_annotated_text(text):
    """Split *text* on '.' and tag each sentence with its predicted class.

    Returns a tuple shaped for annotated_text(): (sentence, label) pairs
    interleaved with "." separator strings. Whitespace-only fragments are
    skipped (the original only stripped spaces, so tab/newline-only
    fragments were sent to the model).
    """
    # Class-index -> display label; replaces the original if/if/if chain.
    labels = {0: "Leadership", 1: "Diversity", 2: "Integrity"}
    model = get_classifier()
    annotated = []
    for sentence in text.split("."):
        if not sentence.strip():
            continue
        # argmax over the 3 logits; .item() converts the 0-d tensor to an
        # int so it can key the dict (the original compared a tensor to
        # ints and left a debug print behind).
        cls_idx = model(model.preprocess([sentence])).argmax().item()
        annotated.append((sentence, labels[cls_idx]))
        annotated.append(".")
    return tuple(annotated)

st.title("Code of Conduct Classifier")

input_text = st.text_area("enter code of conduct text")

st.title("annotated text")

# Only run the (expensive) model once the user has actually entered text;
# on first render input_text is empty and there is nothing to annotate.
# (A leftover debug print of the input was removed here.)
if input_text.strip():
    annotated_text(*get_annotated_text(input_text))