Spaces:

valurank
/

spam_comment_detection

Runtime error

App Files Files Community

spam_comment_detection / app.py

abdulmatinomotoso

Update app.py

846bf7a verified 11 months ago

raw

history blame

1.82 kB

	import gradio as gr
	import numpy as np
	import pandas as pd
	import re
	from transformers import AutoTokenizer, AutoModelForSequenceClassification
	import torch


	# Load the fine-tuned spam-comment classifier and its matching tokenizer.
	model_name = "valurank/distilroberta-spam-comments-detection"
	model = AutoModelForSequenceClassification.from_pretrained(model_name)
	tokenizer = AutoTokenizer.from_pretrained(model_name)

	# Prefer the GPU when one is available. The model must live on the same
	# device as the tensors passed to it; without this move the weights stay on
	# the CPU while the inputs are sent to `device`, and the forward pass fails
	# with a device-mismatch error on CUDA hosts.
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	model.to(device)

	def clean_text(raw_text):
	    """Normalize a raw comment before it is tokenized.

	    Steps, in order:
	      1. drop every non-ASCII character;
	      2. turn newlines and tabs into spaces and collapse runs of spaces;
	      3. remove ``Date d/m/yyyy`` stamps and ``hh:mm XX YY`` time stamps;
	      4. collapse spaces again, because step 3 can leave doubled or
	         dangling spaces behind.

	    Args:
	        raw_text: The comment as a ``str``.

	    Returns:
	        The cleaned text, with single spaces between words and no leading
	        or trailing whitespace.
	    """
	    # Remove non-ASCII characters (accented letters, CJK, emoji, ...).
	    text = raw_text.encode("ascii", errors="ignore").decode("ascii")

	    # Newlines and tabs become plain spaces so that the space-collapsing
	    # step below treats all of them uniformly.
	    text = re.sub(r"[\n\t]", " ", text)
	    text = re.sub(" +", " ", text).strip()

	    text = re.sub(r"Date\s\d{1,2}\/\d{1,2}\/\d{4}", "", text)  # remove date
	    text = re.sub(r"\d{1,2}:\d{2}\s[A-Z]+\s[A-Z]+", "", text)  # remove time

	    # Deleting a stamp from the middle or the edge of the text leaves the
	    # spaces that surrounded it, so normalize once more.
	    return re.sub(" +", " ", text).strip()


	# Classify a single comment with the spam-detection model.
	def get_category(text):
	    """Return the label the model considers most likely for *text*.

	    The text is cleaned with ``clean_text``, tokenized (truncated to the
	    tokenizer's limit), and run through the classifier. The logits are
	    turned into probabilities with a softmax, and the label from
	    ``model.config.id2label`` with the highest probability is returned.

	    Args:
	        text: The raw comment to classify.

	    Returns:
	        The winning label, as stored in ``model.config.id2label``.
	    """
	    text = clean_text(text)

	    input_tensor = tokenizer.encode(text, return_tensors="pt", truncation=True)
	    # Send the inputs to wherever the model's weights actually are, so the
	    # forward pass cannot hit a CPU/GPU device mismatch.
	    input_tensor = input_tensor.to(model.device)

	    # Pure inference: skip autograd bookkeeping.
	    with torch.no_grad():
	        logits = model(input_tensor).logits

	    # One input row, so take row 0 of the (batch, labels) probabilities.
	    probs = torch.softmax(logits, dim=1)[0].cpu().tolist()
	    scores = {label: probs[int(idx)] for idx, label in model.config.id2label.items()}

	    return max(scores, key=scores.get)

	# Build the Gradio interface: one text box in, the predicted label out.
	demo = gr.Interface(
	    fn=get_category,
	    inputs=gr.Textbox(label="Drop your comment here"),
	    outputs="text",
	    title="Spam comments detection",
	)


	# Launch the Gradio app only when this file is run as a script, not when
	# it is imported as a module.
	if __name__ == "__main__":
	    demo.launch(debug=True)