HamidRezaei's picture
Create app.py
bcb1984 verified
raw
history blame
3.35 kB
Hugging Face's logo
Hugging Face
Search models, datasets, users...
Models
Datasets
Spaces
Posts
Docs
Solutions
Pricing
Spaces:
Asa-AI-Lab
/
Offensive-Detection-Space
private
Logs
App
Files
Community
Settings
Offensive-Detection-Space
/
app.py
hafez97's picture
hafez97
Update app.py
b244916
verified
13 days ago
raw
Copy download link
history
blame
edit
delete
2.96 kB
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os
import torch
from cleantext import clean
import hazm
import re
def cleanhtml(raw_html):
    """Strip HTML/XML tags from *raw_html* and return the remaining text.

    Uses a non-greedy ``<.*?>`` match, so each tag is removed individually.
    NOTE(review): ``.`` does not match newlines, so a tag split across
    lines would survive — presumably inputs are single-line messages.
    """
    return re.sub(r'<.*?>', '', raw_html)
# Compiled once at import time: rebuilding these per call was pure overhead.
_WEIRD_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u'\U00010000-\U0010ffff'
    u"\u200d"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\u3030"
    u"\ufe0f"
    u"\u2069"
    u"\u2066"
    # u"\u200c"
    u"\u2068"
    u"\u2067"
    "]+", flags=re.UNICODE)
_WHITESPACE = re.compile(r"\s+")  # raw string: "\s" in a plain literal is an invalid escape


def cleaning(text):
    """Normalize a Persian text message for classification.

    Pipeline: generic cleanup (lowercase/punct/stopwords/stemming via
    ``cleantext``) -> HTML tag removal -> hazm normalization -> emoji and
    control-character stripping -> hashtag and whitespace collapsing.

    Parameters
    ----------
    text : str
        Raw user message.

    Returns
    -------
    str
        Cleaned, normalized text.
    """
    text = text.strip()
    # regular cleaning
    text = clean(text,
                 clean_all=True,
                 punct=True,
                 stopwords=True,
                 stemming=True,
                 extra_spaces=True)
    # cleaning htmls
    text = cleanhtml(text)
    # normalizing (Normalizer is stateless across calls; a fresh instance
    # per call matches the original behavior)
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)
    # removing weird patterns (emoji, pictographs, BiDi control chars, ...)
    text = _WEIRD_PATTERN.sub('', text)
    # removing hashtag markers, then collapsing runs of whitespace
    text = re.sub("#", "", text)
    text = _WHITESPACE.sub(" ", text)
    return text
# --- Model setup -----------------------------------------------------------
# The repo is private, so an access token is required; read it from the
# Space's environment (ACCESS_TOKEN secret).
access_token = os.getenv('ACCESS_TOKEN')
tokenizer = AutoTokenizer.from_pretrained("HamidRezaei/Persian-Offensive-Language-Detection-Lora", token=access_token)
model = AutoModelForSequenceClassification.from_pretrained("HamidRezaei/Persian-Offensive-Language-Detection-Lora", token=access_token)

# --- UI --------------------------------------------------------------------
st.title("Offensive or Not?")
prompt = st.text_area(label="Send a message")
# NOTE(review): the button's value is never checked below — classification
# runs whenever the text area is non-empty, with or without a click.
button = st.button("send")

if prompt:
    normalized_prompt = cleaning(prompt)
    encoding = tokenizer(normalized_prompt, return_tensors="pt")
    encoding = {k: v.to(model.device) for k, v in encoding.items()}
    # Inference only: disable autograd to avoid building a gradient graph.
    with torch.no_grad():
        outputs = model(**encoding)
    logits = outputs.logits
    # apply sigmoid + threshold
    # NOTE(review): .item() assumes the head emits a single logit per input
    # (binary classification with one output) — confirm against the model config.
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(logits.squeeze().cpu())
    score = probs.item()
    st.markdown(f"Offensive: score {score}" if score > 0.5 else f"Not Offensive: score {score}")