import re
import unicodedata

import nltk
from nltk import WordNetLemmatizer
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import XLMRobertaForSequenceClassification
from transformers import Trainer
import gradio as gr


def preprocess_text(text: str) -> str:
    """
    Preprocesses the input text by removing or replacing specific patterns.

    Args:
        text (str): The input text to be preprocessed.

    Returns:
        str: The preprocessed text, with URLs, mentions, numbers, hashtags,
        repeated punctuation and elongated words replaced by placeholder
        tokens, emojis and other special characters removed, diacritics
        stripped, and leading/trailing spaces trimmed.
    """
    # Gruber-style URL pattern. It is written across several lines for readability;
    # all whitespace is stripped before compiling, since the intended pattern
    # contains no literal whitespace.
    URL_PATTERN_STR = r"""(?i)((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info
        |int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|
        bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|
        cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|
        gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|
        la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|
        nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|
        sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|
        uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]
        *?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)
        [a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name
        |post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn
        |bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg
        |eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id
        |ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|
        md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|
        ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|
        sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|
        za|zm|zw)\b/?(?!@)))"""
    # Strip the line-break whitespace introduced above before compiling, so the
    # alternation is not broken by literal newlines and indentation.
    URL_PATTERN = re.compile("".join(URL_PATTERN_STR.split()), re.IGNORECASE)
    HASHTAG_PATTERN = re.compile(r'#\w*')
    MENTION_PATTERN = re.compile(r'@\w*')
    PUNCT_REPEAT_PATTERN = re.compile(r'([!?.]){2,}')
    ELONG_PATTERN = re.compile(r'\b(\S*?)(.)\2{2,}\b')
    WORD_PATTERN = re.compile(r'[^\w<>\s]')

    # Replace URLs, user mentions, numbers and hashtags with placeholder tokens.
    text = re.sub(URL_PATTERN, ' <URL>', text)
    text = re.sub(r"/", " / ", text)
    text = re.sub(MENTION_PATTERN, ' <USER> ', text)
    text = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", " <NUMBER> ", text)
    text = re.sub(HASHTAG_PATTERN, ' <HASHTAG> ', text)

    # Mark repeated punctuation (e.g. "!!!") and elongated words (e.g. "soooo").
    text = re.sub(PUNCT_REPEAT_PATTERN, lambda match: f" {match.group(1)} <REPEAT> ", text)
    text = re.sub(ELONG_PATTERN, lambda match: f" {match.group(1)}{match.group(2)} <ELONG> ", text)

    text = text.strip()

    # Drop remaining special characters (angle brackets are kept for the placeholder tokens).
    text = re.sub(WORD_PATTERN, ' ', text)
    text = text.strip()

    # Strip diacritics via NFKD normalization.
    text = ''.join(c for c in unicodedata.normalize('NFKD', text) if not unicodedata.combining(c))
    return text
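

# Illustrative example (hypothetical input, shown for clarity, not executed here):
#   preprocess_text("Check https://example.com @user #nlp!!!")
# is expected to yield placeholder tokens such as <URL>, <USER>, <HASHTAG> and <REPEAT>
# in place of the raw URL, mention, hashtag and repeated punctuation; internal
# whitespace is not collapsed.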


def lemmatize_text(text: str) -> str:
    """
    Lemmatizes the input text using the WordNet lemmatizer.

    This function lemmatizes each word in the input text. If the WordNet
    data is not available locally, it downloads the necessary data and retries.

    Args:
        text (str): The input text to be lemmatized.

    Returns:
        str: The lemmatized text.
    """
    lemmatizer = WordNetLemmatizer()
    downloaded = False
    while not downloaded:
        try:
            # Probe call: raises LookupError if the WordNet corpus is missing.
            lemmatizer.lemmatize(text)
            downloaded = True
        except LookupError:
            print("Downloading WordNet...")
            nltk.download('wordnet')
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
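

# Illustrative example (hypothetical input): lemmatize_text("the cats were running")
# is expected to return roughly "the cat were running", since the default WordNet
# POS is noun and verbs are therefore left largely untouched.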


def predict(phrase: str, finetuned_model: str):
    # Apply the same preprocessing used for the training data.
    phrase = preprocess_text(phrase)
    phrase = lemmatize_text(phrase)
    phrase = phrase.lower()

    # Pick the tokenizer matching the base model of the fine-tuned checkpoint.
    if 'xlm' in finetuned_model.lower():
        tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
        model = XLMRobertaForSequenceClassification.from_pretrained(finetuned_model)
    else:
        tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-hate')
        model = AutoModelForSequenceClassification.from_pretrained(finetuned_model)

    trainer = Trainer(
        model=model,
        processing_class=tokenizer,
    )

    # Truncate so that over-long inputs do not exceed the model's maximum length.
    tokens = tokenizer(
        phrase,
        truncation=True,
        return_tensors="pt"
    )

    phrase_dataset = Dataset.from_dict({
        "input_ids": tokens["input_ids"],
        "attention_mask": tokens["attention_mask"],
    })

    pred = trainer.predict(phrase_dataset)

    # Binary classification: index 1 corresponds to the "sexist" class.
    sexist = "Sexist" if pred.predictions.argmax() == 1 else "Not sexist"
    return sexist
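

# Illustrative call (the model name is one of the fine-tuned checkpoints offered
# in the dropdown below; it is downloaded from the Hugging Face Hub on first use):
#   predict("some phrase", "MatteoFasulo/xlm-roberta-base_42")
# returns either "Sexist" or "Not sexist".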


demo = gr.Interface(
    fn=predict,
    inputs=[
        "textbox",
        gr.Dropdown(
            [
                "MatteoFasulo/twitter-roberta-base-hate_69",
                "MatteoFasulo/twitter-roberta-base-hate_1337",
                "MatteoFasulo/twitter-roberta-base-hate_42",
                "MatteoFasulo/xlm-roberta-base_69",
                "MatteoFasulo/xlm-roberta-base_1337",
                "MatteoFasulo/xlm-roberta-base_42",
            ],
            label="Model",
            info="Choose the model to use for prediction.",
        ),
    ],
    outputs="text",
)

demo.launch()