import gradio as gr
import requests
import emoji
import re
import json
from thefuzz import process, fuzz
import numpy as np

API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"
headers = {"Authorization": "Bearer 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"}


def read_text(filename, filetype='txt'):
    """Load a wordlist (.txt, one entry per line, deduplicated) or a dict (.json)."""
    words = []
    if filetype == 'txt':
        with open(filename + '.txt') as file:
            words = [line.rstrip() for line in file]
        words = list(set(words))
    elif filetype == 'json':
        with open(filename + '.json') as json_file:
            words = json.load(json_file)
    return words


contractions = read_text('contractions', 'json')
lookup_words = read_text('lookup_words')
obj_pronouns = read_text('obj_pronouns')
profanities = read_text('profanities', 'json')
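
# Expected data shapes, inferred from how these objects are used below (the
# actual data files are not shown here):
#   contractions ~ {'di': 'hindi', ...}                 # contraction -> expansion
#   profanities  ~ {'putangina': ['tangina', ...], ...} # canonical form -> variations
#   lookup_words ~ ['putangina', 'tangina', ...]        # flat list of match targets
#   obj_pronouns ~ ['ko', 'mo', ...]                    # loaded but unused below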


def fuzzyLookup(tweet):
    # Flatten the profanity dict into one lookup array: every variation plus every canonical form
    lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
    # Note: this local list is what the function actually uses, not the obj_pronouns loaded from file
    obj_pronoun = ['ko', 'mo', 'nya', 'natin', 'namin', 'ninyo', 'nila', 'ka', 'nyo', 'ng']
    matches = dict()

    # Loop over each word in the tweet
    for word in tweet.split():
        scores = []
        matched_words = []
        # Only consider words with at least 4 characters
        if len(word) >= 4:
            # Get the fuzzy ratio against every lookup word
            for lookup_word in lookup_words:
                score = fuzz.ratio(word, lookup_word)
                if score >= 65:
                    scores.append(score)
                    matched_words.append(lookup_word)
            # Keep the best-scoring candidate if it is a known profanity
            if len(scores) > 0:
                max_score_index = np.argmax(scores)
                if matched_words[max_score_index] in lookup_profanity:
                    matches[word] = matched_words[max_score_index]

    # Separate an object pronoun fused onto the profanity (e.g. 'tanginamo' -> 'tangina mo')
    for word, matched_profanity in matches.items():
        word_split = word.split(matched_profanity[-2:])
        for pronoun in obj_pronoun:
            if len(word_split) > 1:
                if pronoun == word_split[-1]:
                    matches[word] = matched_profanity + ' ' + pronoun
                    break

    # Replace each matched word in the tweet with its fuzzy-lookup result
    for word, matched_profanity in matches.items():
        tweet = tweet.replace(word, matched_profanity)

    # Normalize every remaining variation to its canonical profanity
    tweet_split = tweet.split()
    for profanity, prof_variations in profanities.items():
        for i, word in enumerate(tweet_split):
            if word in prof_variations:
                tweet_split[i] = profanity
    tweet = ' '.join(tweet_split)

    return tweet, json.dumps(matches)
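
# Illustrative walk-through, assuming 'tangina' appears in both lookup_words and
# the profanities dict: for the input word 'tanginamo', fuzz.ratio('tanginamo',
# 'tangina') is 88 >= 65, so it becomes the best match; splitting 'tanginamo' on
# the match's last two letters ('na') gives ['ta', 'gi', 'mo'], whose tail 'mo'
# is an object pronoun, so the word is rewritten as 'tangina mo'.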


def preprocess(text):
    laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
    symbols = ['@', '#']

    # Lowercase
    text = text.lower()

    # Remove emojis
    text = emoji.replace_emoji(text, replace='')

    # Collapse elongated words, e.g. 'grabeee' -> 'grabe' (doubled letters are left alone)
    text = re.sub(r'(.)\1{2,}', r'\1', text)

    # Split the sentence into a list of words
    row_split = text.split()
    for index, word in enumerate(row_split):
        # Remove words with symbols (e.g. @username, #hashtag)
        if any(x in word for x in symbols):
            row_split[index] = ''
        # Remove links
        if 'http' in word:
            row_split[index] = ''
        # Unify laugh-text variants to 'haha'
        if any(x in word for x in laugh_texts):
            row_split[index] = 'haha'
        # Remove words containing digits (e.g. '4ever')
        if any(x.isdigit() for x in word):
            row_split[index] = ''

    # Join the remaining words back into a sentence
    combined_text = ' '.join(filter(None, row_split))

    # If only a single word remains, return it without further filtering
    if len(combined_text.split()) == 1:
        return combined_text

    # Keep only letters and spaces
    combined_text = re.sub(r"[^A-Za-z ]+", '', combined_text)

    # Expand contractions
    for contraction, expansion in contractions.items():
        combined_text = re.sub(rf"\b{contraction}\b", expansion, combined_text)

    return combined_text
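
# Example trace through the steps above (assuming no contraction entry applies):
#   preprocess('Grabeee!!! @user 4ever http://x.com hahaha')
#   -> lowercase + collapse 3+ repeats: 'grabe! @user 4ever http://x.com hahaha'
#   -> drop @-words, links, digit words; unify laughs: 'grabe! haha'
#   -> strip non-letter characters: 'grabe haha'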


def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()
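
# The hosted Inference API also accepts a dict payload with an "options" field;
# {"wait_for_model": True} makes the request block while the model spins up,
# instead of returning the loading error handled in predict() below. A sketch
# of that variant (query_wait is not part of the original app):
#
#   def query_wait(text):
#       payload = {"inputs": text, "options": {"wait_for_model": True}}
#       return requests.post(API_URL, headers=headers, json=payload).json()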


def predict(text):
    text = preprocess(text)
    text, matches = fuzzyLookup(text)
    output = query(text)

    if 'error' in output:
        return output['error'], 'Error occurred. Try again later.', {"error": "error"}
    else:
        # Flatten [{'label': ..., 'score': ...}, ...] into a {label: score} dict
        output = [tuple(i.values()) for i in output[0]]
        output = dict((x, y) for x, y in output)
        predicted_label = list(output.keys())[0]

        if predicted_label == 'Abusive':
            # Censor each profanity with asterisks, keeping spaces intact
            output_text = text
            for profanity in profanities:
                compiled = re.compile(re.escape(profanity), re.IGNORECASE)
                mask = ""
                for i in profanity:
                    mask += "*" if i != " " else " "
                output_text = compiled.sub(mask, output_text)
            return output, output_text, matches
        else:
            return output, text, matches
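
# Assumed success-response shape (the standard text-classification format on
# the Inference API; not verified against this specific model):
#   [[{'label': 'Abusive', 'score': 0.97}, {'label': 'Non-Abusive', 'score': 0.03}]]
# predict() flattens this to {'Abusive': 0.97, 'Non-Abusive': 0.03} and takes
# the first key as the prediction, relying on labels arriving sorted by score.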

# TODO: 'gag0' not appearing (preprocess drops words containing digits before the fuzzy lookup runs)

hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')

demo = gr.Interface(
    fn=predict,
    inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],
    outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
             gr.components.Text(label='OUTPUT'),
             gr.components.JSON()],
    examples=['Tangina mo naman sobrang yabang mo gago!! @davidrafael',
              'Napakainit ngayong araw pakshet namaaan!!',
              'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
              'Bobo ka ba? napakadali lang nyan eh... 🤡',
              'Uy gago laptrip yung nangyare samen kanina HAHAHA'],
    allow_flagging="manual",
    flagging_callback=hf_writer,
    flagging_options=['Good bot', 'Bad bot']
)

demo.launch()