Spaces:

NimaKL
/

spamd

Build error

App Files Files Community

NimaKL committed on Oct 3, 2022

Commit

fe72a06

1 Parent(s): dc010b2

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -68

app.py CHANGED Viewed

@@ -10,71 +10,67 @@ with col1:
     st.markdown("Message spam detection tool for Turkish language. Due the small size of the dataset, I decided to go with transformers         technology Google BERT. Using the Turkish pre-trained model BERTurk, I imporved the accuracy of the tool by 18 percent compared to the previous model which used fastText.")
-def predict(new_sentence):
-    # We need Token IDs and Attention Mask for inference on the new sentence
-    test_ids = []
-    test_attention_mask = []
-    # Apply the tokenizer
-    encoding = preprocessing(new_sentence, tokenizer)
-    # Extract IDs and Attention Mask
-    test_ids.append(encoding['input_ids'])
-    test_attention_mask.append(encoding['attention_mask'])
-    test_ids = torch.cat(test_ids, dim = 0)
-    test_attention_mask = torch.cat(test_attention_mask, dim = 0)
-    # Forward pass, calculate logit predictions
-    with torch.no_grad():
-      output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))
-    prediction = 'Spam' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'Normal'
-    pred = 'Predicted Class: '+ prediction
-    with col2:
-        st.header(pred)
-with col2:
-    text = st.text_input("Enter the text you'd like to analyze for spam.")
-    if text or st.button('Analyze'):
-        predict(text)
-import torch
-import numpy as np
-from transformers import AutoTokenizer
-tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")
-from transformers import AutoModel
-model = BertForSequenceClassification.from_pretrained("NimaKL/spamd_model")
-token_id = []
-attention_masks = []
-def preprocessing(input_text, tokenizer):
-  '''
-  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
-    - input_ids: list of token ids
-    - token_type_ids: list of token type ids
-    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
-  '''
-  return tokenizer.encode_plus(
-                        input_text,
-                        add_special_tokens = True,
-                        max_length = 32,
-                        pad_to_max_length = True,
-                        return_attention_mask = True,
-                        return_tensors = 'pt'
-                   )
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-#Used for printing the name if the variables. Removing it will not intrupt the project.
-def namestr(obj, namespace):
-    return [name for name in namespace if namespace[name] is obj]
-    #st.write('Input', namestr(new_sentence, globals()),': \n', new_sentence)

     st.markdown("Message spam detection tool for Turkish language. Due the small size of the dataset, I decided to go with transformers         technology Google BERT. Using the Turkish pre-trained model BERTurk, I imporved the accuracy of the tool by 18 percent compared to the previous model which used fastText.")
+# Every message is padded/truncated to this many tokens before inference.
+MAX_LENGTH = 32
+# Key under which the loaded (tokenizer, model, device) tuple is kept, so it
+# survives the script reruns Streamlit performs on every widget interaction.
+STATE_KEY = 'spamd'
+
+
+def load_model():
+    '''
+    Load the tokenizer and the fine-tuned classifier.
+
+    The heavy libraries are imported here rather than at module import time,
+    so the page can render before they are loaded.
+
+    Returns a (tokenizer, model, device) tuple; the model has already been
+    moved to `device` and switched to evaluation mode.
+    '''
+    import torch
+    from transformers import AutoTokenizer, BertForSequenceClassification
+
+    tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")
+    model = BertForSequenceClassification.from_pretrained("NimaKL/spamd_model")
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    # Inputs are sent to `device` in predict(), so the weights must live there too.
+    model.to(device)
+    model.eval()
+    return tokenizer, model, device
+
+
+def preprocessing(input_text, tokenizer):
+    '''
+    Tokenize `input_text` with `tokenizer.encode_plus`.
+
+    The text is padded and truncated to MAX_LENGTH tokens and the result is
+    requested as PyTorch tensors. predict() reads the 'input_ids' and
+    'attention_mask' entries of the returned encoding.
+    '''
+    return tokenizer.encode_plus(
+        input_text,
+        add_special_tokens=True,
+        max_length=MAX_LENGTH,
+        padding='max_length',
+        truncation=True,
+        return_attention_mask=True,
+        return_tensors='pt',
+    )
+
+
+# Used for printing the name of a variable. Removing it will not interrupt the project.
+def namestr(obj, namespace):
+    '''Return every name in `namespace` that is bound to the object `obj`.'''
+    return [name for name in namespace if namespace[name] is obj]
+
+
+def predict(new_sentence):
+    '''
+    Classify `new_sentence` and show the result as a header in `col2`.
+
+    The model must already have been loaded into st.session_state[STATE_KEY].
+    Class index 1 is reported as 'Spam', anything else as 'Normal'.
+    '''
+    import torch
+
+    tokenizer, model, device = st.session_state[STATE_KEY]
+    encoding = preprocessing(new_sentence, tokenizer)
+    # Forward pass only; no gradients are needed for inference.
+    with torch.no_grad():
+        output = model(
+            encoding['input_ids'].to(device),
+            token_type_ids=None,
+            attention_mask=encoding['attention_mask'].to(device),
+        )
+    prediction = 'Spam' if output.logits.argmax(dim=-1).item() == 1 else 'Normal'
+    with col2:
+        st.header('Predicted Class: ' + prediction)
+
+
+# st.button() is True only on the rerun triggered by the click, so the loaded
+# objects are stored in the session state instead of in plain variables.
+if st.button('Load Model'):
+    with st.spinner('Wait for it...'):
+        st.session_state[STATE_KEY] = load_model()
+    st.success('Model loaded.')
+
+if STATE_KEY in st.session_state:
+    with col2:
+        text = st.text_input("Enter the text you'd like to analyze for spam.")
+        # Render the button unconditionally; inside an `or` it would vanish
+        # as soon as the text box is non-empty.
+        analyze_clicked = st.button('Analyze')
+        if text.strip():
+            predict(text)
+        elif analyze_clicked:
+            st.warning('Please enter some text to analyze.')