Spaces:
Runtime error
Runtime error
Profanity with hashtag detection
Browse files- app.py +64 -40
- contractions.json +4 -1
app.py
CHANGED
@@ -7,8 +7,7 @@ from thefuzz import process, fuzz
|
|
7 |
import numpy as np
|
8 |
import re
|
9 |
import nltk
|
10 |
-
|
11 |
-
from nltk.corpus import words
|
12 |
|
13 |
|
14 |
API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"
|
@@ -36,58 +35,86 @@ def read_text(filename, filetype='txt'):
|
|
36 |
contractions = read_text('contractions', 'json')
|
37 |
similar_words = read_text('similar_words')
|
38 |
addon_words = read_text('addon_words')
|
39 |
-
|
40 |
-
lookup_profanity = np.concatenate([np.hstack(list(
|
41 |
-
lookup_words = list(set(similar_words).union(set(lookup_profanity
|
42 |
-
eng_words = list(
|
43 |
punctuations = re.compile(r'^[^\w#@]+|[^\w#@]+$')
|
44 |
|
45 |
-
# TODO check eng words that are tagalog profanities
|
46 |
-
|
47 |
def fuzzy_lookup(tweet):
|
48 |
|
49 |
matched_profanity = dict()
|
50 |
|
51 |
-
# tweet = punctuations.sub('', tweet).lower()
|
52 |
-
|
53 |
for word in tweet.split():
|
54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
word = punctuations.sub('', word).lower()
|
|
|
|
|
56 |
base_word = word
|
|
|
|
|
57 |
word = re.sub(r'(.)\1{2,}', r'\1', word)
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
|
|
|
|
62 |
for addon in addon_words:
|
63 |
if word.startswith(addon):
|
64 |
word = word[len(addon):]
|
65 |
if word.endswith(addon):
|
66 |
word = word[:-len(addon)]
|
67 |
|
68 |
-
if word
|
69 |
-
|
70 |
|
71 |
-
|
72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
|
74 |
-
|
75 |
-
|
76 |
-
for lookup_word in lookup_words:
|
77 |
-
score = fuzz.ratio(word, lookup_word)
|
78 |
-
if score >= 70:
|
79 |
-
scores.append(score)
|
80 |
-
matched_words.append(lookup_word)
|
81 |
-
if len(scores) > 0:
|
82 |
-
max_score_index = np.argmax(scores)
|
83 |
-
if matched_words[max_score_index] in lookup_profanity:
|
84 |
-
for base_profanity, profanity_variations in profanities.items():
|
85 |
-
if matched_words[max_score_index] == base_profanity:
|
86 |
-
matched_profanity[base_word] = base_profanity
|
87 |
-
break
|
88 |
-
if matched_words[max_score_index] in profanity_variations:
|
89 |
-
matched_profanity[base_word] = base_profanity
|
90 |
-
break
|
91 |
|
92 |
return matched_profanity
|
93 |
|
@@ -108,11 +135,6 @@ def preprocess(tweet, profanities):
|
|
108 |
|
109 |
for index, word in enumerate(row_split):
|
110 |
|
111 |
-
# Separate pronouns
|
112 |
-
for addon in addon_words:
|
113 |
-
if word.endswith(addon):
|
114 |
-
row_split[index] = word[:-len(addon)] + " " + addon
|
115 |
-
|
116 |
# Remove links
|
117 |
if 'http' in word:
|
118 |
row_split[index] = ''
|
@@ -150,9 +172,11 @@ def predict(tweet):
|
|
150 |
print(prediction)
|
151 |
error_message = prediction['error']
|
152 |
return error_message, {}
|
|
|
153 |
prediction = prediction[0][0]["label"]
|
154 |
|
155 |
print("\nTWEET:", tweet)
|
|
|
156 |
print("DETECTED PROFANITY:", list(profanities.keys()))
|
157 |
print("LABEL:", prediction, "\n")
|
158 |
|
|
|
7 |
import numpy as np
|
8 |
import re
|
9 |
import nltk
|
10 |
+
from english_words import get_english_words_set
|
|
|
11 |
|
12 |
|
13 |
API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"
|
|
|
35 |
contractions = read_text('contractions', 'json')
|
36 |
similar_words = read_text('similar_words')
|
37 |
addon_words = read_text('addon_words')
|
38 |
+
profanities_dict = read_text('profanities', 'json')
|
39 |
+
lookup_profanity = np.concatenate([np.hstack(list(profanities_dict.values())), list(profanities_dict.keys())]).tolist()
|
40 |
+
lookup_words = list(set(similar_words).union(set(lookup_profanity)))
|
41 |
+
eng_words = list(get_english_words_set(['web2'], lower=True) - set(lookup_profanity))
|
42 |
punctuations = re.compile(r'^[^\w#@]+|[^\w#@]+$')
|
43 |
|
|
|
|
|
44 |
def fuzzy_lookup(tweet):
|
45 |
|
46 |
matched_profanity = dict()
|
47 |
|
|
|
|
|
48 |
for word in tweet.split():
|
49 |
|
50 |
+
if word in eng_words:
|
51 |
+
continue
|
52 |
+
|
53 |
+
scores = []
|
54 |
+
matched_words = []
|
55 |
+
matched_word = None
|
56 |
+
|
57 |
+
# Remove leading and trailing punctuation except # and @
|
58 |
word = punctuations.sub('', word).lower()
|
59 |
+
|
60 |
+
# Save base word
|
61 |
base_word = word
|
62 |
+
|
63 |
+
# Shorten elongated word
|
64 |
word = re.sub(r'(.)\1{2,}', r'\1', word)
|
65 |
+
|
66 |
+
# Remove # and @
|
67 |
+
if word.startswith("#") or word.startswith("@"):
|
68 |
+
word = word[1:]
|
69 |
+
|
70 |
+
# Remove leading and trailing add-on words (mo, ka, pinaka)
|
71 |
for addon in addon_words:
|
72 |
if word.startswith(addon):
|
73 |
word = word[len(addon):]
|
74 |
if word.endswith(addon):
|
75 |
word = word[:-len(addon)]
|
76 |
|
77 |
+
if len(word) < 4:
|
78 |
+
continue
|
79 |
|
80 |
+
# Get fuzzy ratio
|
81 |
+
for lookup_word in lookup_words:
|
82 |
+
|
83 |
+
score = fuzz.ratio(word, lookup_word)
|
84 |
+
|
85 |
+
# Threshold
|
86 |
+
if score >= 70:
|
87 |
+
scores.append(score)
|
88 |
+
matched_words.append(lookup_word)
|
89 |
+
|
90 |
+
if len(scores) == 0:
|
91 |
+
continue
|
92 |
+
|
93 |
+
if len(set(scores)) == 1:
|
94 |
+
for matched_word in matched_words:
|
95 |
+
if matched_word in lookup_profanity:
|
96 |
+
matched_word = matched_word
|
97 |
+
break
|
98 |
+
else:
|
99 |
+
# Get matched word with max score
|
100 |
+
max_score_index = np.argmax(scores)
|
101 |
+
matched_word = matched_words[max_score_index]
|
102 |
+
|
103 |
+
if matched_word not in lookup_profanity:
|
104 |
+
continue
|
105 |
+
|
106 |
+
for base_profanity, profanity_variations in profanities_dict.items():
|
107 |
+
|
108 |
+
if matched_word in profanity_variations or matched_word == base_profanity:
|
109 |
+
|
110 |
+
# Separate pronouns
|
111 |
+
for addon in addon_words:
|
112 |
+
if base_word.endswith(addon):
|
113 |
+
base_profanity = base_profanity + " " + addon
|
114 |
+
break
|
115 |
|
116 |
+
matched_profanity[base_word] = base_profanity
|
117 |
+
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
|
119 |
return matched_profanity
|
120 |
|
|
|
135 |
|
136 |
for index, word in enumerate(row_split):
|
137 |
|
|
|
|
|
|
|
|
|
|
|
138 |
# Remove links
|
139 |
if 'http' in word:
|
140 |
row_split[index] = ''
|
|
|
172 |
print(prediction)
|
173 |
error_message = prediction['error']
|
174 |
return error_message, {}
|
175 |
+
|
176 |
prediction = prediction[0][0]["label"]
|
177 |
|
178 |
print("\nTWEET:", tweet)
|
179 |
+
print("PROCESSED TWEET:", preprocessed_tweet)
|
180 |
print("DETECTED PROFANITY:", list(profanities.keys()))
|
181 |
print("LABEL:", prediction, "\n")
|
182 |
|
contractions.json
CHANGED
@@ -29,5 +29,8 @@
|
|
29 |
"kelan": "kailan",
|
30 |
"raw": "daw",
|
31 |
"itong": "ito ang",
|
32 |
-
"lng": "lang"
|
|
|
|
|
|
|
33 |
}
|
|
|
29 |
"kelan": "kailan",
|
30 |
"raw": "daw",
|
31 |
"itong": "ito ang",
|
32 |
+
"lng": "lang",
|
33 |
+
"putang ina": "putangina",
|
34 |
+
"tangina" : "tangina",
|
35 |
+
"inamo" : "ina mo"
|
36 |
}
|