Spaces:

knowhate
/

portuguese-counter-hate-speech-detection

Sleeping

App Files Files Community

portuguese-counter-hate-speech-detection / app.py

pfialho

Update app.py

f033d32 verified 5 months ago

raw

history blame contribute delete

6.41 kB

	import gradio as gr
	from transformers import TFBertModel, TFXLMRobertaModel
	import numpy as np
	import tensorflow as tf
	from transformers import AutoTokenizer
	from huggingface_hub import from_pretrained_keras
	import re


	app_title = "Portuguese Counter Hate-Speech Detection"

	app_description = """
	This prototype from the kNOwHATE project aims to classify a Portuguese target sentence as either hate speech, counter hate speech or neutral, considering another sentence as context.

	We collected 24,739 YouTube comments and 29,846 tweets, annotated by experts, and trained our prototype on this data.

	We invite you to try it out. You can just enter a pair of sentences below, one as target and another as context, and submit it to see if the target is either hate speech, counter hate speech or neutral, relative to the context.

	For more, visit our [website](https://knowhate.eu) and [Hugging Face page](https://huggingface.co/knowhate).
	"""

	def_model = 'knowhate/counterhate-youtube-bertimbau'
	def_model2 = 'knowhate/counterhate-twitter-bertbasemultilingualcased-cleantxt'

	model_list = [
	def_model,
	"knowhate/counterhate-youtube-xlmrobertabase",
	"knowhate/counterhate-youtube-bertbasemultilingualcased",
	"knowhate/counterhate-twitter-bertimbau",
	"knowhate/counterhate-twitter-bertimbau-cleantxt",
	"knowhate/counterhate-twitter-xlmrobertabase",
	"knowhate/counterhate-twitter-xlmrobertabase-cleantxt",
	"knowhate/counterhate-twitter-bertbasemultilingualcased",
	def_model2
	]

	kw_to_hf = {"knowhate/counterhate-twitter-bertimbau": "neuralmind/bert-base-portuguese-cased",
	"knowhate/counterhate-twitter-bertimbau-cleantxt": "neuralmind/bert-base-portuguese-cased",
	"knowhate/counterhate-twitter-xlmrobertabase": "xlm-roberta-base",
	"knowhate/counterhate-twitter-xlmrobertabase-cleantxt": "xlm-roberta-base",
	"knowhate/counterhate-twitter-bertbasemultilingualcased": "bert-base-multilingual-cased",
	"knowhate/counterhate-twitter-bertbasemultilingualcased-cleantxt": "bert-base-multilingual-cased",
	"knowhate/counterhate-youtube-bertimbau": "neuralmind/bert-base-portuguese-cased",
	"knowhate/counterhate-youtube-xlmrobertabase": "xlm-roberta-base",
	"knowhate/counterhate-youtube-bertbasemultilingualcased": "bert-base-multilingual-cased"
	}
	# "knowhate/counterhate-youtube-hateberttuga": "knowhate/hateberttuga",
	# "knowhate/counterhate-twitter-hateberttuga": "knowhate/hateberttuga"

	# 1 0 2
	app_examples = [
	["Totalmente de acordo mano ...quando somos nós já nao e racismo...Fdse isto e uma vergonha ..",
	"Mimimi... Vocês são preconceituosos e não tem vergonha na cara!",
	def_model],
	["\"Não acredites em tudo o que lês na Internet\" - Abraham Lincoln",
	"A Internet foi desenvolvida entre os anos 1973-1989.",
	def_model],
	["Então o Marcelo foi ao Qatar para 'falar de direitos humanos', mas não foi a Odemira?",
	"esse retardado mental, foi a praia do katar, la tem a agua mais kentinha.",
	def_model],
	["Essa gente tem é de deixar de ser apaparicada pelo Estado e começar a cumprir os seus deveres como cidadãos.",
	"Nepia o que faz com que as pessoas generalizem é o ódio intrínseco que têm contra uma etnia, ng é responsável pela sua xenofobia",
	def_model2],
	["Nem vou comentar o hate e misoginia que tenho visto aqui no tt em relação à Anitta",
	"E xenofobia também. Tugas no seu melhor",
	def_model2],
	["A Festa tá no Climax, chama o zuca pra Dançar.",
	"Já reparaste no contador da luz? Vai trabalhar malandro",
	def_model2]

	]


	def remove_emojis(data):
	if '@' in data:
	data = re.sub(r"(?<=^\|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z0-9-_]+[A-Za-z0-9-_]+)", "", data).strip()

	if 'https' in data:
	data = re.sub(
	r"(?i)\b(?:[a-z][\w.+-]+:(?:/{1,3}\|[?+]?[a-z0-9%]))(?:[^\s()<>]+\|\(([^\s()<>]+\|(\([^\s()<>]+\)))\))+(?:\(([^\s()<>]+\|(\([^\s()<>]+\)))\)\|[^\s\x60!()\[\]{};:'\".,<>?«»“”‘’])",
	"", data).strip()

	emoj = re.compile("["
	u"\U0001F600-\U0001F64F" # emoticons
	u"\U0001F300-\U0001F5FF" # symbols & pictographs
	u"\U0001F680-\U0001F6FF" # transport & map symbols
	u"\U0001F1E0-\U0001F1FF" # flags (iOS)
	u"\U00002500-\U00002BEF" # chinese char
	u"\U00002702-\U000027B0"
	u"\U000024C2-\U0001F251"
	u"\U0001f926-\U0001f937"
	u"\U00010000-\U0010ffff"
	u"\u2640-\u2642"
	u"\u2600-\u2B55"
	u"\u200d"
	u"\u23cf"
	u"\u23e9"
	u"\u231a"
	u"\ufe0f" # dingbats
	u"\u3030"
	"]+", re.UNICODE)
	return re.sub(emoj, '', data)


	def predict(text, target, chosen_model):
	# model1 = tf.keras.models.load_model(chosen_model, custom_objects={"TFBertModel": TFBertModel})

	print(chosen_model)

	if '-cleantxt' in chosen_model:
	text = remove_emojis(text)
	target = remove_emojis(target)

	print(text)
	print(target)

	model1 = from_pretrained_keras(chosen_model)

	checkpoint = kw_to_hf[chosen_model] # "neuralmind/bert-base-portuguese-cased"
	if '/' in checkpoint:
	tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True, model_max_length=512)
	else:
	tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)

	tokpair = tokenizer(text, target, truncation=True, padding='max_length', return_tensors='tf', return_token_type_ids=False)

	outp = model1.signatures["serving_default"](**tokpair)

	proto_tensor = tf.make_tensor_proto(outp['outp'])
	allscores = tf.make_ndarray(proto_tensor)[0]

	scores_dict = {
	'Neutral': allscores[0],
	'Counter Speech': allscores[1],
	'Hate Speech': allscores[2]
	}

	return scores_dict

	inputs = [
	gr.Textbox(label="Context", value= app_examples[0][0]),
	gr.Textbox(label="Target", value= app_examples[0][1]),
	gr.Dropdown(label="Model", choices=model_list, value=model_list[0])
	]

	outputs = [
	gr.Label(label="Result"),
	]

	gr.Interface(fn=predict, inputs=inputs, outputs=outputs, title=app_title,
	description=app_description, examples=app_examples, theme=gr.themes.Base(primary_hue="red")).launch()