Added match words list on output
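The new "match words" output is the matches dict that fuzzyLookup builds (each flagged word mapped to the profanity it was resolved to) and returns as json.dumps(matches). A hypothetical illustration of its shape; the input words here are invented:

    # Shape of the new "match words" output (illustrative values only).
    # fuzzyLookup maps each flagged word to its resolved profanity, and can
    # append a fused object pronoun, e.g. 'tanginamo' -> 'tangina mo'.
    matches = {"tangna": "tangina", "tanginamo": "tangina mo"}
    # predict() now returns json.dumps(matches) alongside the label scores and masked text.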
app.py CHANGED
@@ -2,79 +2,99 @@ import gradio as gr
 import requests
 import emoji
 import re
+import json
+from thefuzz import process, fuzz
+import numpy as np
+
 
 API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"
 headers = {"Authorization": "Bearer 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"}
 
-profanities = ['bobo', '…
-               'pakshet', 'pakyu', 'pucha',
-               'punyeta', '…
-               … (removed lines 12-63 not recoverable from the page render)
+profanities = ['bobo', 'bwiset', 'gago', 'kupal',
+               'pakshet', 'pakyu', 'pucha',
+               'punyeta', 'puta', 'pota', 'putangina', 'tanga', 'tangina',
+               'tarantado', 'ulol']
+
+
+def read_text(filename, filetype='txt'):
+    words = []
+
+    if filetype == 'txt':
+        with open(filename + '.txt') as file:
+            words = [line.rstrip() for line in file]
+            words = list(set(words))
+    elif filetype == 'json':
+        with open(filename + '.json') as json_file:
+            words = json.load(json_file)
+
+    return words
+
+
+contractions = read_text('contractions', 'json')
+lookup_words = read_text('lookup_words')
+obj_pronouns = read_text('obj_pronouns')
+profanities = read_text('profanities', 'json')
+
+
+def fuzzyLookup(tweet):
+    lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
+    obj_pronoun = ['ko', 'mo', 'nya', 'natin', 'namin', 'ninyo', 'nila', 'ka', 'nyo', 'ng']
+    matches = dict()
+
+    # Loop over each word in the tweet
+    for word in tweet.split():
+        scores = []
+        matched_words = []
+        # Only consider words of at least 4 characters
+        if len(word) >= 4:
+            # Score the word against every lookup word
+            for lookup_word in lookup_words:
+                score = fuzz.ratio(word, lookup_word)
+                if score >= 65:
+                    scores.append(score)
+                    matched_words.append(lookup_word)
+            # Keep the best-scoring match if it is a known profanity
+            if len(scores) > 0:
+                max_score_index = np.argmax(scores)
+                if matched_words[max_score_index] in lookup_profanity:
+                    matches[word] = matched_words[max_score_index]
+
+    # Split off an object pronoun fused onto the end of a matched profanity
+    for word, matched_profanity in matches.items():
+        word_split = word.split(matched_profanity[-2:])
+        for pronoun in obj_pronoun:
+            if len(word_split) > 1:
+                if pronoun == word_split[-1]:
+                    matches[word] = matched_profanity + ' ' + pronoun
+                    break
+
+    # Replace each profanity with its fuzzy-lookup result
+    for word, matched_profanity in matches.items():
+        tweet = tweet.replace(word, matched_profanity)
+
+    # Map known variations back to their canonical profanity
+    tweet_split = tweet.split()
+    for profanity, prof_variations in profanities.items():
+        for i, word in enumerate(tweet_split):
+            if word in prof_variations:
+                tweet_split[i] = profanity
+    tweet = ' '.join(tweet_split)
+
+    return tweet, json.dumps(matches)
+
+
+def preprocess(text):
     laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
     symbols = ['@', '#']
 
     # Lowercase
-    …
+    text = text.lower()
 
     # Remove emojis
-    …
+    text = emoji.replace_emoji(text, replace='')
 
     # Replace elongated words, 'grabeee' -> 'grabe' (letters repeated only twice are kept)
-    …
+    text = re.sub(r'(.)\1{2,}', r'\1', text)
 
     # Split sentence into list of words
-    row_split = …
+    row_split = text.split()
 
     for index, word in enumerate(row_split):
 
@@ -117,11 +137,12 @@ def query(payload):
 
 
 def predict(text):
-    …
-    …
+    text = preprocess(text)
+    text, matches = fuzzyLookup(text)
+    output = query(text)
 
     if 'error' in output:
-        return output['error'], 'Error occurred. Try again later.'
+        return output['error'], 'Error occurred. Try again later.', {"error": "error"}
     else:
         output = [tuple(i.values()) for i in output[0]]
         output = dict((x, y) for x, y in output)
@@ -136,11 +157,11 @@ def predict(text):
             for i in profanity:
                 mask += "*" if i != " " else " "
             output_text = compiled.sub(mask, output_text)
-        return output, output_text
+        return output, output_text, matches
     else:
-        return output, text
-
+        return output, text, matches
 
+# TODO gag0 not appearing
 
 
 hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
@@ -152,7 +173,8 @@ demo = gr.Interface(
     inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],
 
     outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
-             gr.components.Text(label='OUTPUT')],
+             gr.components.Text(label='OUTPUT'),
+             gr.components.JSON()],
 
     examples=['Tangina mo naman sobrang yabang mo gago!!π π€ @davidrafael',
               'Napakainit ngayong araw pakshet namaaan!!',
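For reference, the fuzzy step above flags a word once fuzz.ratio against some lookup word reaches 65 and the best-scoring match is a known profanity. A minimal, self-contained sketch of that scoring logic; lookup_words below is a stand-in for the list the app loads from lookup_words.txt:

    from thefuzz import fuzz
    import numpy as np

    lookup_words = ['tangina', 'pakshet', 'gago']  # stand-in for the file-loaded list

    def best_match(word, threshold=65):
        # Score the word against every lookup word, as fuzzyLookup does
        scores = [fuzz.ratio(word, lw) for lw in lookup_words]
        best = int(np.argmax(scores))
        if scores[best] >= threshold:
            return lookup_words[best], scores[best]
        return None

    print(best_match('tangna'))  # ('tangina', 92): near-misses still match
    print(best_match('ganda'))   # None: ordinary words score below 65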