NimaKL committed on
Commit
d8a6944
·
1 Parent(s): 8dcda59

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -64
app.py CHANGED
@@ -9,68 +9,70 @@ with col1:
9
  st.title("Spamd: Turkish Spam Detector")
10
  st.markdown("Message spam detection tool for Turkish language. Due the small size of the dataset, I decided to go with transformers technology Google BERT. Using the Turkish pre-trained model BERTurk, I imporved the accuracy of the tool by 18 percent compared to the previous model which used fastText.")
11
 
12
-
13
- if st.button('Load Model'):
14
- with st.spinner('Wait for it...'):
15
-
16
- import torch
17
- import numpy as np
18
-
19
- from transformers import AutoTokenizer
20
- tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")
21
- from transformers import AutoModel
22
- model = BertForSequenceClassification.from_pretrained("NimaKL/spamd_model")
23
-
24
- token_id = []
25
- attention_masks = []
26
-
27
- def preprocessing(input_text, tokenizer):
28
- '''
29
- Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
30
- - input_ids: list of token ids
31
- - token_type_ids: list of token type ids
32
- - attention_mask: list of indices (0,1) specifying which tokens should be considered by the model (return_attention_mask = True).
33
- '''
34
- return tokenizer.encode_plus(
35
- input_text,
36
- add_special_tokens = True,
37
- max_length = 32,
38
- pad_to_max_length = True,
39
- return_attention_mask = True,
40
- return_tensors = 'pt'
41
- )
42
-
43
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
44
- #Used for printing the name of the variables. Removing it will not interrupt the project.
45
- def namestr(obj, namespace):
46
- return [name for name in namespace if namespace[name] is obj]
47
-
48
- def predict(new_sentence):
49
- # We need Token IDs and Attention Mask for inference on the new sentence
50
- test_ids = []
51
- test_attention_mask = []
52
-
53
- # Apply the tokenizer
54
- encoding = preprocessing(new_sentence, tokenizer)
55
-
56
- # Extract IDs and Attention Mask
57
- test_ids.append(encoding['input_ids'])
58
- test_attention_mask.append(encoding['attention_mask'])
59
- test_ids = torch.cat(test_ids, dim = 0)
60
- test_attention_mask = torch.cat(test_attention_mask, dim = 0)
61
-
62
- # Forward pass, calculate logit predictions
63
- with torch.no_grad():
64
- output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))
65
-
66
- prediction = 'Spam' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'Normal'
67
- pred = 'Predicted Class: '+ prediction
68
- with col2:
69
- st.header(pred)
70
 
71
- #st.write('Input', namestr(new_sentence, globals()),': \n', new_sentence)
72
- with col2:
73
- text = st.text_input("Enter the text you'd like to analyze for spam.")
74
- if text or st.button('Analyze'):
75
- predict(text)
76
- st.success("Model Loaded!")
 
 
 
 
 
 
 
9
  st.title("Spamd: Turkish Spam Detector")
10
  st.markdown("Message spam detection tool for Turkish language. Due the small size of the dataset, I decided to go with transformers technology Google BERT. Using the Turkish pre-trained model BERTurk, I imporved the accuracy of the tool by 18 percent compared to the previous model which used fastText.")
11
 
12
+ with col2:
13
+ st.title("Spamd: Turkish Spam Detector")
14
+ if st.button('Load Model'):
15
+ with st.spinner('Wait for it...'):
16
+
17
+ import torch
18
+ import numpy as np
19
+
20
+ from transformers import AutoTokenizer
21
+ tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")
22
+ from transformers import AutoModel
23
+ model = BertForSequenceClassification.from_pretrained("NimaKL/spamd_model")
24
+
25
+ token_id = []
26
+ attention_masks = []
27
+
28
+ def preprocessing(input_text, tokenizer):
29
+ '''
30
+ Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
31
+ - input_ids: list of token ids
32
+ - token_type_ids: list of token type ids
33
+ - attention_mask: list of indices (0,1) specifying which tokens should be considered by the model (return_attention_mask = True).
34
+ '''
35
+ return tokenizer.encode_plus(
36
+ input_text,
37
+ add_special_tokens = True,
38
+ max_length = 32,
39
+ pad_to_max_length = True,
40
+ return_attention_mask = True,
41
+ return_tensors = 'pt'
42
+ )
43
+
44
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
45
+ #Used for printing the name of the variables. Removing it will not interrupt the project.
46
+ def namestr(obj, namespace):
47
+ return [name for name in namespace if namespace[name] is obj]
48
+
49
+ def predict(new_sentence):
50
+ # We need Token IDs and Attention Mask for inference on the new sentence
51
+ test_ids = []
52
+ test_attention_mask = []
53
+
54
+ # Apply the tokenizer
55
+ encoding = preprocessing(new_sentence, tokenizer)
56
+
57
+ # Extract IDs and Attention Mask
58
+ test_ids.append(encoding['input_ids'])
59
+ test_attention_mask.append(encoding['attention_mask'])
60
+ test_ids = torch.cat(test_ids, dim = 0)
61
+ test_attention_mask = torch.cat(test_attention_mask, dim = 0)
62
+
63
+ # Forward pass, calculate logit predictions
64
+ with torch.no_grad():
65
+ output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))
 
 
 
 
66
 
67
+ prediction = 'Spam' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'Normal'
68
+ pred = 'Predicted Class: '+ prediction
69
+ with col2:
70
+ st.header(pred)
71
+
72
+ #st.write('Input', namestr(new_sentence, globals()),': \n', new_sentence)
73
+ with col2:
74
+ text = st.text_input("Enter the text you'd like to analyze for spam.")
75
+ if text or st.button('Analyze'):
76
+ predict(text)
77
+ st.success("Model Loaded!")
78
+