Spaces: Runtime error
Fixed word lookup including emojis
app.py CHANGED
@@ -5,16 +5,12 @@ import re
 import json
 from thefuzz import process, fuzz
 import numpy as np
+import re


 API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"
 headers = {"Authorization": "Bearer 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"}

-profanities = ['bobo', 'bwiset','gago', 'kupal',
-               'pakshet', 'pakyu', 'pucha',
-               'punyeta', 'puta', 'pota', 'putangina', 'tanga', 'tangina',
-               'tarantado', 'ulol']
-
 def read_text(filename, filetype='txt'):
     words = []

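The hard-coded profanities list is removed in this hunk; the surviving context suggests the words now come from read_text, whose body lies outside the diff. A minimal sketch of what such a loader might look like, assuming one word per line (the body and the call site are assumptions, not shown in this commit):

def read_text(filename, filetype='txt'):
    # Hypothetical body: collect non-empty lines as words.
    with open(filename + '.' + filetype, encoding='utf-8') as f:
        words = [line.strip() for line in f if line.strip()]
    return words

# Assumed call site, replacing the deleted literal list:
# profanities = read_text('profanities')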
@@ -42,6 +38,8 @@ def fuzzyLookup(tweet):

     # Loop each word in tweet
     for word in tweet.split():
+        # Only get digits and letters
+        word = re.sub("[^a-zA-Z0-9@]", "", word)
         scores = []
         matched_words = []
         # If word > 4 chars
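The two added lines are the fix named in the commit title: each token is stripped down to letters, digits, and @ before fuzzy matching, so an emoji or punctuation glued onto a word no longer hides it from the lookup. A quick illustration:

import re

for token in ['gago!!', 'tanga😠', '@user']:
    print(re.sub("[^a-zA-Z0-9@]", "", token))
# gago
# tanga
# @user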
@@ -77,24 +75,24 @@ def fuzzyLookup(tweet):
             tweet_split[i] = profanity
     tweet = ' '.join(tweet_split)

-    return tweet,
+    return tweet, matches


-def preprocess(
+def preprocess(tweet):
     laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
     symbols = ['@', '#']

     # Lowercase
-
+    tweet = tweet.lower()

     # Remove emojis
-
+    tweet = emoji.replace_emoji(tweet, replace='')

     # Replace elongated words 'grabeee' -> 'grabe' (not applicable on 2 corresponding letter)
-
+    tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)

     # Split sentence into list of words
-    row_split =
+    row_split = tweet.split()

     for index, word in enumerate(row_split):

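The filled-in preprocessing steps normalize a tweet before it reaches the model. Note that emoji.replace_emoji comes from the emoji package; its import is not visible in this diff, so it is assumed to happen elsewhere in app.py. An illustrative trace:

import re
import emoji  # assumed imported elsewhere in app.py

tweet = 'GRABEee init ngayon haha 😅'
tweet = tweet.lower()                           # 'grabeee init ngayon haha 😅'
tweet = emoji.replace_emoji(tweet, replace='')  # 'grabeee init ngayon haha '
tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)      # 'grabe init ngayon haha '

Doubled letters survive, as the comment notes, because the pattern only fires on a character repeated three or more times; laugh_texts and symbols are presumably consumed in the per-word loop that follows this hunk.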
@@ -136,32 +134,27 @@ def query(payload):
     return response.json()


-def predict(
-
-
-
+def predict(tweet):
+
+    fuzz_text, matches = fuzzyLookup(tweet)
+    processed_text = preprocess(fuzz_text)
+    output = query(processed_text)

     if 'error' in output:
         return output['error'], 'Error occured. Try again later.', {"error": "error"}
     else:
         output = [tuple(i.values()) for i in output[0]]
         output = dict((x, y) for x, y in output)
-
         predicted_label = list(output.keys())[0]

         if predicted_label == 'Abusive':
-
-
-
-
-            for i in profanity:
-                mask += "*" if i != " " else " "
-            output_text = compiled.sub(mask, output_text)
-            return output, output_text, matches
+            for base_word, _ in matches.items():
+                tweet = tweet.replace(base_word, re.sub("[a-zA-Z0-9@]", "*", base_word))
+
+            return output, tweet, json.dumps(matches)
         else:
-            return output,
-
-            # TODO gag0 not appearing
+            return output, tweet, json.dumps(matches)
+


 hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
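predict() now threads the fuzzyLookup results through to the interface: the tweet is fuzzy-corrected, preprocessed, and sent to the model via query (only the return line of query is visible here; it presumably POSTs the payload to API_URL with the bearer header). On an 'Abusive' prediction, each detected base word is starred out in place, replacing the old mask/compiled logic and its 'gag0 not appearing' TODO. Roughly, assuming matches maps a detected base word to the spellings it matched:

import re

tweet = 'sobrang yabang mo gago'
matches = {'gago': ['gago']}  # hypothetical shape, for illustration only

for base_word, _ in matches.items():
    tweet = tweet.replace(base_word, re.sub("[a-zA-Z0-9@]", "*", base_word))

print(tweet)  # sobrang yabang mo ****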
@@ -174,7 +167,7 @@ demo = gr.Interface(

     outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
              gr.components.Text(label='OUTPUT'),
-             gr.components.JSON()],
+             gr.components.JSON(label='DETECTED PROFANITIES')],

     examples=['Tangina mo naman sobrang yabang mo gago!!π π€ @davidrafael',
               'Napakainit ngayong araw pakshet namaaan!!',
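Relabeling the JSON component lines the three outputs up with predict()'s three return values: the score dict drives the Label, the masked tweet fills the Text box, and the serialized matches land in the JSON panel. A hypothetical round trip, with made-up values:

scores, masked_tweet, detected = predict('Napakainit ngayong araw pakshet namaaan!!')
# scores       -> {'Abusive': 0.9, ...}                        (illustrative)
# masked_tweet -> 'Napakainit ngayong araw ******* namaaan!!'  (illustrative)
# detected     -> '{"pakshet": [...]}'                         (shape assumed)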