Fixed disappearing profanities
- app.py +9 -17
- lookup_words.txt +1 -0
app.py
CHANGED
@@ -31,9 +31,8 @@ obj_pronouns = read_text('obj_pronouns')
 profanities = read_text('profanities', 'json')
 
 
-def fuzzyLookup(tweet):
+def fuzzy_lookup(tweet):
     lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
-    obj_pronoun = ['ko', 'mo', 'nya', 'natin', 'namin', 'ninyo', 'nila', 'ka', 'nyo', 'ng']
     matches = dict()
 
     # Loop each word in tweet
@@ -58,7 +57,7 @@ def fuzzyLookup(tweet):
 
     for word, matched_profanity in matches.items():
         word_split = word.split(matched_profanity[-2:])
-        for pronoun in obj_pronoun:
+        for pronoun in obj_pronouns:
             if len(word_split) > 1:
                 if pronoun == word_split[-1]:
                     matches[word] = matched_profanity + ' ' + pronoun
@@ -68,13 +67,12 @@ def fuzzyLookup(tweet):
     for word, matched_profanity in matches.items():
         tweet = tweet.replace(word, matched_profanity)
 
-    tweet_split = tweet.split()
     for profanity, prof_varations in profanities.items():
-
-
-
-        tweet = ' '.join(tweet_split)
+        if len(prof_varations) > 0:
+            for prof_variant in prof_varations:
+                tweet = tweet.replace(prof_variant, profanity)
 
+    print('Fuzzy Returns:', tweet)
     return tweet, matches
 
 
@@ -108,10 +106,6 @@ def preprocess(tweet):
         if any(x in word for x in laugh_texts):
             row_split[index] = 'haha'
 
-        # Remove words with digits (4ever)
-        if any(x.isdigit() for x in word):
-            row_split[index] = ''
-
     # Combine list of words back to sentence
     combined_text = ' '.join(filter(None, row_split))
 
@@ -136,9 +130,8 @@ def query(payload):
 
 def predict(tweet):
 
-
-
-    output = query(processed_text)
+    fuzzy_text, matches = fuzzy_lookup(tweet)
+    output = query(preprocess(fuzzy_text))
 
     if 'error' in output:
         return output['error'], 'Error occured. Try again later.', {"error": "error"}
@@ -149,14 +142,13 @@ def predict(tweet):
 
     if predicted_label == 'Abusive':
         for base_word, _ in matches.items():
+
             tweet = tweet.replace(base_word, re.sub("[a-zA-Z0-9@]", "*", base_word))
 
         return output, tweet, json.dumps(matches)
     else:
         return output, tweet, json.dumps(matches)
 
-
-
 hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
 
 
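The heart of the fix is the variant-normalization loop in fuzzy_lookup: the old code split the tweet into tweet_split and re-joined it at the end, which is the likely source of the disappearing profanities named in the commit title; the new code replaces each known spelling variant with its base profanity directly on the tweet string. A minimal runnable sketch of that step (the two-entry profanities dict below is a hypothetical stand-in for the JSON actually loaded by read_text('profanities', 'json')):

# Hypothetical stand-in for read_text('profanities', 'json'):
# base profanity -> list of spelling variants.
profanities = {
    'gago': ['g@go', 'gag0', 'g@g0'],
    'tanga': ['t@nga', 'tang@'],
}

def normalize_variants(tweet):
    # Replace every known variant with its base form, editing the
    # tweet string itself rather than a separately split word list.
    for profanity, prof_varations in profanities.items():
        if len(prof_varations) > 0:
            for prof_variant in prof_varations:
                tweet = tweet.replace(prof_variant, profanity)
    return tweet

print(normalize_variants('g@g0 ka talaga'))  # -> gago ka talaga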
lookup_words.txt
CHANGED
@@ -152,4 +152,5 @@ kang
 bubuka
 buka
 talaga
+tuloy
 g@g0
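On the app.py side, when the model labels a tweet Abusive, predict censors each matched base word by substituting '*' for its letters, digits, and '@' signs. A small standalone illustration, with mask_matches as a hypothetical wrapper around the re.sub call shown in the diff:

import re

def mask_matches(tweet, matches):
    # Star out letters, digits, and '@' in every matched base word,
    # mirroring the Abusive branch of predict().
    for base_word in matches:
        tweet = tweet.replace(base_word, re.sub("[a-zA-Z0-9@]", "*", base_word))
    return tweet

print(mask_matches('gago ka talaga', {'gago': 'gago'}))  # -> **** ka talaga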