import os
import json
import re
from string import punctuation

import emoji
import gradio as gr
import numpy as np
import requests
from thefuzz import process, fuzz


API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"
# Read the Hugging Face API token from the environment instead of hardcoding a secret in source.
headers = {"Authorization": f"Bearer {os.getenv('HF_API_TOKEN')}"}

def read_text(filename, filetype='txt'):
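    """Load a deduplicated word list from '<filename>.txt' or parsed JSON from '<filename>.json'."""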
    words = []

    if filetype == 'txt':
        with open(filename + '.txt') as file:
            words = [line.rstrip() for line in file]
            words = list(set(words))
    elif filetype == 'json':
        with open(filename + '.json') as json_file:
            words = json.load(json_file)
    
    return words


contractions = read_text('contractions', 'json')
lookup_words = read_text('lookup_words')
obj_pronouns = read_text('obj_pronouns')
profanities = read_text('profanities', 'json')
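
# Inferred from usage below: 'contractions' maps contraction -> expansion,
# 'profanities' maps base profanity -> list of spelling variants, and
# 'lookup_words' / 'obj_pronouns' are plain word lists.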

def query(text):
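    """POST the text to the hosted inference API and return the parsed JSON response."""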
    text = {"inputs": text}
    response = requests.post(API_URL, headers=headers, json=text)
    return response.json()
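
# Typical response shape (assuming the standard text-classification inference API):
#   [[{'label': 'Abusive', 'score': 0.97}, {'label': 'Non-Abusive', 'score': 0.03}]]
# or, while the model is still loading, an error dict such as:
#   {'error': 'Model ... is currently loading', 'estimated_time': 20.0}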


# for profanity in profanities:
#     print(profanity, process.extractOne(profanity, tweet.split(), scorer=fuzz.ratio))


def fuzzy_lookup(tweet):
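    """Fuzzy-match each word of the tweet against known profanity spellings.

    Returns the tweet with matched words rewritten to their base profanity,
    plus a dict mapping each original word -> detected profanity.
    """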

    matched_profanity = dict()

    # Convert Profanity Dict to List
    lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])

    # Loop each word in tweet
    for word in tweet.split():
        scores = []
        matched_words = []

        # Strip surrounding punctuation
        word = word.strip(punctuation)

        # Keep only letters, digits, and '@'
        processed_word = re.sub("[^a-zA-Z0-9@]", "", word)

        # Only consider words with at least 4 characters
        if len(processed_word) >= 4:
            # Get fuzzy ratio
            for lookup_word in lookup_words:
                score = fuzz.ratio(processed_word, lookup_word)
                if score >= 70:
                    scores.append(score)
                    matched_words.append(lookup_word)
            if len(scores) > 0:
                max_score_index = np.argmax(scores)
                if matched_words[max_score_index] in lookup_profanity:
                    matched_profanity[word] = matched_words[max_score_index]

    # Re-attach object pronouns fused onto a matched word (e.g. 'tanginamo' -> 'tangina mo')
    for word, profanity in matched_profanity.items():
        word_split = word.split(profanity[-2:])
        for pronoun in obj_pronouns:
            if len(word_split) > 1:
                if pronoun == word_split[-1]:
                    matched_profanity[word] = profanity + ' ' + pronoun
                    break

    # Replace each matched word in the tweet with its detected base profanity
    for word, profanity in matched_profanity.items():
        tweet = tweet.replace(word, profanity)

    # Normalize known spelling variants to their base profanity
    for profanity, prof_variations in profanities.items():
        if len(prof_variations) > 0:
            for prof_variant in prof_variations:
                tweet = tweet.replace(prof_variant, profanity)

    return tweet, matched_profanity
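
# Illustrative example (assumes 'tangina' is present in lookup_words and profanities):
#   fuzzy_lookup('t4ngin4 mo')  ->  ('tangina mo', {'t4ngin4': 'tangina'})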


def preprocess(tweet):
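    """Normalize a raw tweet (lowercase, strip emojis/links, collapse elongations,
    unify laugh text, expand contractions) and run the profanity fuzzy lookup."""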
    
    # Lowercase
    tweet = tweet.lower()

    # Remove emojis
    tweet = emoji.replace_emoji(tweet, replace='')

    # Collapse characters repeated 3+ times: 'grabeee' -> 'grabe' (doubled letters are kept)
    tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)

    # Split sentence into list of words
    row_split = tweet.split()

    for index, word in enumerate(row_split):

        # Remove links
        if 'http' in word:
            row_split[index] = ''

        # Unify laugh texts format to 'haha'
        laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
        if any(x in word for x in laugh_texts):
            row_split[index] = 'haha'

    # Combine list of words back to sentence
    preprocessed_tweet = ' '.join(filter(None, row_split))

    # Single-word tweets skip contraction expansion and fuzzy lookup;
    # return an empty match dict so callers can still unpack two values
    if len(preprocessed_tweet.split()) == 1:
        return preprocessed_tweet, {}

    # Expand Contractions
    for contraction, expansion in contractions.items():
        preprocessed_tweet = re.sub(rf"\b{contraction}\b", expansion, preprocessed_tweet)

    # Fuzzy Lookup
    preprocessed_tweet, matches = fuzzy_lookup(preprocessed_tweet)

    return preprocessed_tweet, matches 
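
# Illustrative example (fuzzy matches depend on the loaded word lists):
#   preprocess('Grabeee yung traffic HAHAHA https://t.co/xyz')
#   -> ('grabe yung traffic haha', {})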


def predict(tweet):
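    """Run the full pipeline and return a label -> score dict, or a status message string."""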

    preprocessed_tweet, matched_profanity = preprocess(tweet)

    prediction = query(preprocessed_tweet)

    # While the model is loading, the API returns an error dict instead of a prediction list
    if isinstance(prediction, dict):
        return "Model is still loading. Try again."

    if not matched_profanity:
        return "No profanity found."

    # Flatten [[{'label': ..., 'score': ...}, ...]] into a {label: score} dict
    prediction = {pred['label']: pred['score'] for pred in prediction[0]}
    
    print("\n", tweet)
    print(matched_profanity)
    print(prediction, "\n")

    return prediction
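
# Example return value (label names are an assumption based on the model's output):
#   {'Abusive': 0.93, 'Non-Abusive': 0.07}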


# An earlier version of predict, kept for reference: it also censored the
# matched profanities in the original tweet before returning it.

# def predict(tweet):

#     fuzzy_text, matches = fuzzy_lookup(tweet)
#     processed_text = preprocess(fuzzy_text)
#     output = query(processed_text)

#     if 'error' in output:
#         return output['error'], 'Error occurred. Try again later.', {}
#     elif len(matches) == 0:
#         return 'No Profanity Found.', '', {}
#     else:
#         output = [tuple(i.values()) for i in output[0]]
#         output = dict((x, y) for x, y in output)
#         predicted_label = list(output.keys())[0]

#         if predicted_label == 'Abusive':
#             # Censor each matched base word in the original tweet
#             for base_word, _ in matches.items():
#                 mask = '*' * len(base_word)
#                 compiled = re.compile(re.escape(base_word), re.IGNORECASE)
#                 tweet = compiled.sub(mask, tweet)
#                 # tweet = tweet.replace(base_word, re.sub("[a-zA-Z0-9@]", "*", base_word))
#             return output, tweet, json.dumps(matches)
#         else:
#             return output, tweet, json.dumps(matches)

# output, tweet, matches = predict('ul0L Sama ng ugali mo pre Tangina uL0l!!!')
# print(output, '\n', tweet, '\n', matches)

# hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')


demo = gr.Interface(
    fn=predict,
    inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],
    outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION")],
    examples=['Tangina mo naman sobrang yabang mo gago!!😠😤 @davidrafael',
              'Napakainit ngayong araw pakshet namaaan!!',
              'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
              'Bobo ka ba? napakadali lang nyan eh... 🤡',
              'Uy gago laptrip yung nangyare samen kanina HAHAHA😂😂'],
)

demo.launch()