abdulmatinomotoso's picture
Update app.py
846bf7a verified
raw
history blame
1.82 kB
import gradio as gr
import numpy as np
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# Defining the model and tokenizer
model_name = "valurank/distilroberta-spam-comments-detection"
# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Inputs are sent to `device` at inference time, so the model's weights must
# live on that same device; otherwise the forward pass fails on CUDA hosts.
model.to(device)
def clean_text(raw_text):
    """Normalize a raw comment string before it is tokenized.

    Steps, in order:
      1. drop every non-ASCII character;
      2. turn newlines and tabs into spaces and collapse runs of spaces;
      3. remove ``Date d/m/yyyy`` stamps and ``hh:mm XX YY`` time stamps;
      4. collapse spaces again and trim, because step 3 can leave doubled
         or trailing spaces behind.

    Args:
        raw_text: The text to clean.

    Returns:
        The cleaned text, with single spaces between words and no leading
        or trailing whitespace.
    """
    # Encoding with errors="ignore" silently discards non-ASCII characters.
    text = raw_text.encode("ascii", errors="ignore").decode("ascii")

    # Newlines and tabs become spaces, then runs of spaces shrink to one.
    # This must happen before the date/time patterns run, since those
    # patterns expect exactly one whitespace character between tokens.
    text = re.sub(r"[\n\t]", " ", text)
    text = re.sub(" +", " ", text).strip()

    text = re.sub(r"Date\s\d{1,2}\/\d{1,2}\/\d{4}", "", text)  # remove date
    text = re.sub(r"\d{1,2}:\d{2}\s[A-Z]+\s[A-Z]+", "", text)  # remove time

    # Cutting a stamp out of the middle of a sentence leaves the spaces on
    # both sides of it, so normalize once more before returning.
    return re.sub(" +", " ", text).strip()
# Defining a function to classify a comment with the spam-detection model
def get_category(text):
    """Return the label the model assigns to ``text``.

    The text is cleaned with ``clean_text``, tokenized with truncation
    enabled, and passed through the model. The label from
    ``model.config.id2label`` with the highest probability is returned.

    Args:
        text: The comment to classify.

    Returns:
        The predicted label, as found in ``model.config.id2label``.
    """
    text = clean_text(text)
    input_tensor = tokenizer.encode(text, return_tensors="pt", truncation=True)
    # Send the input to whichever device currently holds the model's weights,
    # so the two can never end up on different devices.
    input_tensor = input_tensor.to(model.device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        logits = model(input_tensor).logits

    # Single input, so take row 0 of the (softmaxed) batch output.
    probs = torch.softmax(logits, dim=1)[0].cpu()
    scores = {
        label: probs[int(idx)].item()
        for idx, label in model.config.id2label.items()
    }
    return max(scores, key=scores.get)
# Creating the interface for the Gradio app: one text box in, predicted
# label out as plain text.
demo = gr.Interface(
    fn=get_category,
    inputs=gr.Textbox(label="Drop your comment here"),
    outputs="text",
    title="Spam comments detection",
)

# Launching the Gradio app only when this file is run as a script
if __name__ == "__main__":
    demo.launch(debug=True)