import gradio as gr
import requests
import emoji
import re
import json
from thefuzz import process, fuzz
import numpy as np

API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"
headers = {"Authorization": "Bearer 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"}


def read_text(filename, filetype='txt'):
    """Load a wordlist (.txt, one entry per line, deduplicated) or a dict (.json)."""
    words = []
    if filetype == 'txt':
        with open(filename + '.txt') as file:
            words = [line.rstrip() for line in file]
        words = list(set(words))
    elif filetype == 'json':
        with open(filename + '.json') as json_file:
            words = json.load(json_file)
    return words


contractions = read_text('contractions', 'json')
lookup_words = read_text('lookup_words')
obj_pronouns = read_text('obj_pronouns')
profanities = read_text('profanities', 'json')
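
# Expected data shapes, inferred from how these objects are used below (the
# actual data files are not shown here):
#   contractions ~ {'di': 'hindi', ...}                 # contraction -> expansion
#   profanities  ~ {'putangina': ['tangina', ...], ...} # canonical form -> variations
#   lookup_words ~ ['putangina', 'tangina', ...]        # flat list of match targets
#   obj_pronouns ~ ['ko', 'mo', ...]                    # loaded but unused below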


def fuzzyLookup(tweet):
    # Flatten the profanity dict into one lookup array: every variation plus every canonical form
    lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
    # Note: this local list is what the function actually uses, not the obj_pronouns loaded from file
    obj_pronoun = ['ko', 'mo', 'nya', 'natin', 'namin', 'ninyo', 'nila', 'ka', 'nyo', 'ng']
    matches = dict()

    # Loop over each word in the tweet
    for word in tweet.split():
        scores = []
        matched_words = []
        # Only consider words with at least 4 characters
        if len(word) >= 4:
            # Get the fuzzy ratio against every lookup word
            for lookup_word in lookup_words:
                score = fuzz.ratio(word, lookup_word)
                if score >= 65:
                    scores.append(score)
                    matched_words.append(lookup_word)
            # Keep the best-scoring candidate if it is a known profanity
            if len(scores) > 0:
                max_score_index = np.argmax(scores)
                if matched_words[max_score_index] in lookup_profanity:
                    matches[word] = matched_words[max_score_index]

    # Separate an object pronoun fused onto the profanity (e.g. 'tanginamo' -> 'tangina mo')
    for word, matched_profanity in matches.items():
        word_split = word.split(matched_profanity[-2:])
        for pronoun in obj_pronoun:
            if len(word_split) > 1:
                if pronoun == word_split[-1]:
                    matches[word] = matched_profanity + ' ' + pronoun
                    break

    # Replace each matched word in the tweet with its fuzzy-lookup result
    for word, matched_profanity in matches.items():
        tweet = tweet.replace(word, matched_profanity)

    # Normalize every remaining variation to its canonical profanity
    tweet_split = tweet.split()
    for profanity, prof_variations in profanities.items():
        for i, word in enumerate(tweet_split):
            if word in prof_variations:
                tweet_split[i] = profanity
    tweet = ' '.join(tweet_split)

    return tweet, json.dumps(matches)
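
# Illustrative walk-through, assuming 'tangina' appears in both lookup_words and
# the profanities dict: for the input word 'tanginamo', fuzz.ratio('tanginamo',
# 'tangina') is 88 >= 65, so it becomes the best match; splitting 'tanginamo' on
# the match's last two letters ('na') gives ['ta', 'gi', 'mo'], whose tail 'mo'
# is an object pronoun, so the word is rewritten as 'tangina mo'.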


def preprocess(text):
    laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
    symbols = ['@', '#']

    # Lowercase
    text = text.lower()

    # Remove emojis
    text = emoji.replace_emoji(text, replace='')

    # Collapse elongated words, e.g. 'grabeee' -> 'grabe' (doubled letters are left alone)
    text = re.sub(r'(.)\1{2,}', r'\1', text)

    # Split the sentence into a list of words
    row_split = text.split()
    for index, word in enumerate(row_split):
        # Remove words with symbols (e.g. @username, #hashtag)
        if any(x in word for x in symbols):
            row_split[index] = ''
        # Remove links
        if 'http' in word:
            row_split[index] = ''
        # Unify laugh-text variants to 'haha'
        if any(x in word for x in laugh_texts):
            row_split[index] = 'haha'
        # Remove words containing digits (e.g. '4ever')
        if any(x.isdigit() for x in word):
            row_split[index] = ''

    # Join the remaining words back into a sentence
    combined_text = ' '.join(filter(None, row_split))

    # If only a single word remains, return it without further filtering
    if len(combined_text.split()) == 1:
        return combined_text

    # Keep only letters and spaces
    combined_text = re.sub(r"[^A-Za-z ]+", '', combined_text)

    # Expand contractions
    for contraction, expansion in contractions.items():
        combined_text = re.sub(rf"\b{contraction}\b", expansion, combined_text)

    return combined_text
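
# Example trace through the steps above (assuming no contraction entry applies):
#   preprocess('Grabeee!!! @user 4ever http://x.com hahaha')
#   -> lowercase + collapse 3+ repeats: 'grabe! @user 4ever http://x.com hahaha'
#   -> drop @-words, links, digit words; unify laughs: 'grabe! haha'
#   -> strip non-letter characters: 'grabe haha'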


def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()
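
# The hosted Inference API also accepts a dict payload with an "options" field;
# {"wait_for_model": True} makes the request block while the model spins up,
# instead of returning the loading error handled in predict() below. A sketch
# of that variant (query_wait is not part of the original app):
#
#   def query_wait(text):
#       payload = {"inputs": text, "options": {"wait_for_model": True}}
#       return requests.post(API_URL, headers=headers, json=payload).json()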


def predict(text):
    text = preprocess(text)
    text, matches = fuzzyLookup(text)
    output = query(text)

    if 'error' in output:
        return output['error'], 'Error occurred. Try again later.', {"error": "error"}
    else:
        # Flatten [{'label': ..., 'score': ...}, ...] into a {label: score} dict
        output = [tuple(i.values()) for i in output[0]]
        output = dict((x, y) for x, y in output)
        predicted_label = list(output.keys())[0]

        if predicted_label == 'Abusive':
            # Censor each profanity with asterisks, keeping spaces intact
            output_text = text
            for profanity in profanities:
                compiled = re.compile(re.escape(profanity), re.IGNORECASE)
                mask = ""
                for i in profanity:
                    mask += "*" if i != " " else " "
                output_text = compiled.sub(mask, output_text)
            return output, output_text, matches
        else:
            return output, text, matches
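
# Assumed success-response shape (the standard text-classification format on
# the Inference API; not verified against this specific model):
#   [[{'label': 'Abusive', 'score': 0.97}, {'label': 'Non-Abusive', 'score': 0.03}]]
# predict() flattens this to {'Abusive': 0.97, 'Non-Abusive': 0.03} and takes
# the first key as the prediction, relying on labels arriving sorted by score.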

# TODO: 'gag0' not appearing (preprocess drops words containing digits before the fuzzy lookup runs)

hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')

demo = gr.Interface(
    fn=predict,
    inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],
    outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
             gr.components.Text(label='OUTPUT'),
             gr.components.JSON()],
    examples=['Tangina mo naman sobrang yabang mo gago!! @davidrafael',
              'Napakainit ngayong araw pakshet namaaan!!',
              'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
              'Bobo ka ba? napakadali lang nyan eh... 🤡',
              'Uy gago laptrip yung nangyare samen kanina HAHAHA'],
    allow_flagging="manual",
    flagging_callback=hf_writer,
    flagging_options=['Good bot', 'Bad bot']
)

demo.launch()