Spaces:

NimaKL
/

spamd

Build error

File size: 3,605 Bytes

607d7d8
 
 
24f4f25
7fb9647
a6b7d4d
1e6658d
 
22a0124
9d6cafd
 
7fa544b
 
 
 
22a0124
00addfe
25cba84
 
 
 
 
 
 
 
 
 
dc256c2
25cba84
 
 
 
dc256c2
25cba84
dc256c2
 
25cba84
 
 
 
 
 
0bc4f74
 
799ec3f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
023198d
 
1e6658d
9d6cafd
 
7fa544b
 
19f23ad
9d6cafd
25cba84
24065d4
872fed1

import streamlit as st
from transformers import pipeline
from textblob import TextBlob
from transformers import BertForSequenceClassification, AdamW, BertConfig
st.set_page_config(layout='wide', initial_sidebar_state='expanded')

# Hugging Face Hub identifiers of the tokenizer and the fine-tuned classifier.
TOKENIZER_NAME = "dbmdz/bert-base-turkish-uncased"
MODEL_NAME = "NimaKL/spamd_model"
# Every input is padded/truncated to this many tokens before inference.
MAX_LENGTH = 32
# st.session_state slot holding the (tokenizer, model, device) tuple.
# Streamlit re-executes this script on every interaction, so anything that
# must outlive the single run in which "Load Model" was clicked lives here.
MODEL_STATE_KEY = "spamd_model_bundle"


def load_model():
    """Load the tokenizer and the spam classifier.

    Returns:
        A ``(tokenizer, model, device)`` tuple. The model has been moved to
        ``device`` and switched to evaluation mode.
    """
    # Heavy imports are deferred so the page renders before they are paid for.
    import torch
    from transformers import AutoTokenizer

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
    model = BertForSequenceClassification.from_pretrained(MODEL_NAME)
    # The inputs are sent to `device` in predict(); the model must live there
    # too, otherwise the forward pass fails on a CUDA host.
    model.to(device)
    model.eval()  # disable dropout for inference
    return tokenizer, model, device


def preprocessing(input_text, tokenizer):
    """Tokenize ``input_text`` into fixed-length PyTorch tensors.

    Args:
        input_text: The raw message to encode.
        tokenizer: The tokenizer used to encode the message.

    Returns:
        The tokenizer output with ``input_ids``, ``token_type_ids`` and
        ``attention_mask``, padded/truncated to ``MAX_LENGTH`` tokens.
    """
    return tokenizer(
        input_text,
        add_special_tokens=True,
        max_length=MAX_LENGTH,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )


def predict(new_sentence):
    """Classify ``new_sentence`` with the model stored in the session state.

    Args:
        new_sentence: The message to classify.

    Returns:
        ``'Predicted Class: Spam'`` when the highest logit is at index 1,
        otherwise ``'Predicted Class: Normal'``.

    Raises:
        KeyError: If the model has not been loaded into the session state.
    """
    import torch

    tokenizer, model, device = st.session_state[MODEL_STATE_KEY]
    encoding = preprocessing(new_sentence, tokenizer)
    # Forward pass only; no gradients are needed for inference.
    with torch.no_grad():
        output = model(
            encoding['input_ids'].to(device),
            token_type_ids=None,
            attention_mask=encoding['attention_mask'].to(device),
        )
    prediction = 'Spam' if output.logits.argmax(dim=-1).item() == 1 else 'Normal'
    return 'Predicted Class: ' + prediction


col1, col2 = st.columns(2)
with col1:
    st.title("Spamd: Turkish Spam Detector")
    st.markdown("Message spam detection tool for Turkish language. Due the small size of the dataset, I decided to go with transformers         technology Google BERT. Using the Turkish pre-trained model BERTurk, I imporved the accuracy of the tool by 18 percent compared to the previous model which used fastText.")

# Reserve the slots for the input widgets above the "Load Model" button, but
# fill them only after the load step so that they are rendered enabled in the
# very same run in which the model finishes loading.
placeholder = st.empty()
placeholder2 = st.empty()

if st.button('Load Model', disabled=False) and MODEL_STATE_KEY not in st.session_state:
    with st.spinner('Wait for it...'):
        st.session_state[MODEL_STATE_KEY] = load_model()

model_loaded = MODEL_STATE_KEY in st.session_state
if model_loaded:
    with col1:
        st.success("Model Loaded!")

# Each widget is created exactly once per run with its own unique key, and its
# return value is the one actually used below.
text = placeholder.text_input(
    "Enter the text you'd like to analyze for spam.",
    disabled=not model_loaded,
    key="spam_text",
)
aButton = placeholder2.button('Analyze', disabled=not model_loaded, key="analyze_button")

if model_loaded:
    if text.strip():
        with col2:
            st.header(predict(text))
    elif aButton:
        # "Analyze" was clicked with nothing to classify.
        with col2:
            st.warning("Please enter some text to analyze.")