HamidRezaei's picture
Create app.py
bcb1984 verified
raw
history blame
3.35 kB
Hugging Face's logo
Hugging Face
Search models, datasets, users...
Models
Datasets
Spaces
Posts
Docs
Solutions
Pricing
Spaces:
Asa-AI-Lab
/
Offensive-Detection-Space
private
Logs
App
Files
Community
Settings
Offensive-Detection-Space
/
app.py
hafez97's picture
hafez97
Update app.py
b244916
verified
13 days ago
raw
Copy download link
history
blame
edit
delete
2.96 kB
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os
import torch
from cleantext import clean
import hazm
import re
def cleanhtml(raw_html):
    """Strip HTML/XML tags from *raw_html* and return the remaining text.

    Uses a non-greedy ``<.*?>`` match, so each tag is removed individually.
    NOTE(review): ``.`` does not match newlines, so a tag split across
    lines would survive — presumably inputs are single-line messages.
    """
    return re.sub(r'<.*?>', '', raw_html)
# Compiled once at import time: rebuilding these per call was pure overhead.
_WEIRD_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u'\U00010000-\U0010ffff'
    u"\u200d"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\u3030"
    u"\ufe0f"
    u"\u2069"
    u"\u2066"
    # u"\u200c"
    u"\u2068"
    u"\u2067"
    "]+", flags=re.UNICODE)
_WHITESPACE = re.compile(r"\s+")  # raw string: "\s" in a plain literal is an invalid escape


def cleaning(text):
    """Normalize a Persian text message for classification.

    Pipeline: generic cleanup (lowercase/punct/stopwords/stemming via
    ``cleantext``) -> HTML tag removal -> hazm normalization -> emoji and
    control-character stripping -> hashtag and whitespace collapsing.

    Parameters
    ----------
    text : str
        Raw user message.

    Returns
    -------
    str
        Cleaned, normalized text.
    """
    text = text.strip()
    # regular cleaning
    text = clean(text,
                 clean_all=True,
                 punct=True,
                 stopwords=True,
                 stemming=True,
                 extra_spaces=True)
    # cleaning htmls
    text = cleanhtml(text)
    # normalizing (Normalizer is stateless across calls; a fresh instance
    # per call matches the original behavior)
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)
    # removing weird patterns (emoji, pictographs, BiDi control chars, ...)
    text = _WEIRD_PATTERN.sub('', text)
    # removing hashtag markers, then collapsing runs of whitespace
    text = re.sub("#", "", text)
    text = _WHITESPACE.sub(" ", text)
    return text
# --- Model setup -----------------------------------------------------------
# The repo is private, so an access token is required; read it from the
# Space's environment (ACCESS_TOKEN secret).
access_token = os.getenv('ACCESS_TOKEN')
tokenizer = AutoTokenizer.from_pretrained("HamidRezaei/Persian-Offensive-Language-Detection-Lora", token=access_token)
model = AutoModelForSequenceClassification.from_pretrained("HamidRezaei/Persian-Offensive-Language-Detection-Lora", token=access_token)

# --- UI --------------------------------------------------------------------
st.title("Offensive or Not?")
prompt = st.text_area(label="Send a message")
# NOTE(review): the button's value is never checked below — classification
# runs whenever the text area is non-empty, with or without a click.
button = st.button("send")

if prompt:
    normalized_prompt = cleaning(prompt)
    encoding = tokenizer(normalized_prompt, return_tensors="pt")
    encoding = {k: v.to(model.device) for k, v in encoding.items()}
    # Inference only: disable autograd to avoid building a gradient graph.
    with torch.no_grad():
        outputs = model(**encoding)
    logits = outputs.logits
    # apply sigmoid + threshold
    # NOTE(review): .item() assumes the head emits a single logit per input
    # (binary classification with one output) — confirm against the model config.
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(logits.squeeze().cpu())
    score = probs.item()
    st.markdown(f"Offensive: score {score}" if score > 0.5 else f"Not Offensive: score {score}")