Spaces:

NimaKL
/

spamd

Build error

App Files Files Community

spamd / app.py

NimaKL

Update app.py

1e6658d almost 3 years ago

raw

history blame

3.48 kB

	import streamlit as st
	from transformers import pipeline
	from textblob import TextBlob
	from transformers import BertForSequenceClassification, AdamW, BertConfig
	st.set_page_config(layout='wide', initial_sidebar_state='expanded')
	col1, col2= st.columns(2)
	placeholder = st.empty()
	placeholder2 = st.empty()
	with col1:
	st.title("Spamd: Turkish Spam Detector")
	st.markdown("Message spam detection tool for Turkish language. Due the small size of the dataset, I decided to go with transformers technology Google BERT. Using the Turkish pre-trained model BERTurk, I imporved the accuracy of the tool by 18 percent compared to the previous model which used fastText.")
	with col2:
	text = placeholder.text_input("Enter the text you'd like to analyze for spam.", disabled=True, key="1")
	aButton = placeholder2.button('Analyze', disabled=True, key="1")
	if st.button('Load Model', disabled=False):
	with st.spinner('Wait for it...'):
	import torch
	import numpy as np
	from transformers import AutoTokenizer
	tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")
	from transformers import AutoModel
	model = BertForSequenceClassification.from_pretrained("NimaKL/spamd_model")

	token_id = []
	attention_masks = []
	def preprocessing(input_text, tokenizer):
	'''
	Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
	- input_ids: list of token ids
	- token_type_ids: list of token type ids
	- attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
	'''
	return tokenizer.encode_plus(
	input_text,
	add_special_tokens = True,
	max_length = 32,
	pad_to_max_length = True,
	return_attention_mask = True,
	return_tensors = 'pt'
	)
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	with col1:
	st.success("Model Loaded!")
	def predict(new_sentence):
	# We need Token IDs and Attention Mask for inference on the new sentence
	test_ids = []
	test_attention_mask = []
	# Apply the tokenizer
	encoding = preprocessing(new_sentence, tokenizer)
	#Extract IDs and Attention Mask
	test_ids.append(encoding['input_ids'])
	test_attention_mask.append(encoding['attention_mask'])
	test_ids = torch.cat(test_ids, dim = 0)
	test_attention_mask = torch.cat(test_attention_mask, dim = 0)
	#Forward pass, calculate logit predictions
	with torch.no_grad():
	output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))
	prediction = 'Spam' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'Normal'
	pred = 'Predicted Class: '+ prediction
	return pred
	placeholder.text_input("Enter the text you'd like to analyze for spam.", disabled=False, key="2")
	placeholder2.button('Analyze', disabled=False, key="2")
	if not flag:
	with col2:
	if text or aButton:
	st.header(predict(text))