File size: 5,412 Bytes
34fbcfb
 
 
 
3172d47
 
 
bce56c0
40a4fcd
 
 
3172d47
34fbcfb
7a70c71
 
34fbcfb
91caef4
f108b87
 
91caef4
 
3172d47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f108b87
 
3172d47
bdff148
f108b87
bdff148
8adc428
3172d47
33125f0
3172d47
6912dca
7a70c71
fa21182
3172d47
8adc428
f108b87
8adc428
f108b87
8adc428
 
fa21182
8adc428
bdff148
d9ea7b2
f108b87
 
 
8adc428
f108b87
8adc428
 
 
 
f108b87
7a70c71
 
93004e9
8adc428
3172d47
 
8adc428
eb2943c
3172d47
 
 
 
 
fa21182
 
 
 
 
 
 
 
f108b87
3172d47
 
fa21182
7a70c71
bce56c0
 
34fbcfb
fa21182
 
 
 
201dfa5
bce56c0
34fbcfb
bce56c0
34fbcfb
 
 
 
 
 
 
 
7a70c71
34fbcfb
 
 
 
7a70c71
34fbcfb
7a70c71
f108b87
34fbcfb
 
 
7a70c71
34fbcfb
f108b87
34fbcfb
 
93004e9
bce56c0
91caef4
fa21182
7a70c71
fa21182
 
 
7a70c71
f108b87
201dfa5
f108b87
 
 
6cbea5c
f108b87
 
 
fa21182
f108b87
16316d5
fa21182
f108b87
6cbea5c
fe9ff70
201dfa5
6c938dd
 
48392ea
6c938dd
34fbcfb
6cbea5c
34fbcfb
6c938dd
 
 
 
 
f108b87
 
 
 
6c938dd
34fbcfb
58eeaa0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import json
import os
import re

import emoji
import gradio as gr
import nltk
import numpy as np
import requests
from thefuzz import process, fuzz

nltk.download('words')
from nltk.corpus import words


# Hosted inference endpoint that `query` posts to.
API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"

# SECURITY: an access token is committed in this file. Supply the token
# through the HF_API_TOKEN environment variable instead; the literal is kept
# only as a fallback so existing deployments keep working, and it should be
# rotated and removed from source control.
_HF_API_TOKEN = os.environ.get("HF_API_TOKEN") or "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"
headers = {"Authorization": f"Bearer {_HF_API_TOKEN}"}

def query(text, timeout=30):
    """Send *text* to the hosted classifier and return the decoded JSON reply.

    Args:
        text: Input string, forwarded as the ``inputs`` field of the payload.
        timeout: Seconds to wait for the service before giving up.

    Returns:
        The parsed JSON body of the response. If the request fails, times
        out, or the body is not valid JSON, a dict of the form
        ``{"error": "<message>"}`` is returned instead of raising; `predict`
        reports dict replies as errors.
    """
    payload = {"inputs": text}
    try:
        # Without a timeout a stalled connection would block the UI forever.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
        return response.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a non-JSON body (e.g. an HTML gateway error page).
        return {"error": f"Inference request failed: {exc}"}

def read_text(filename, filetype='txt'):
    """Load a word resource from disk.

    Args:
        filename: Path of the resource without its extension.
        filetype: ``'txt'`` for a file with one entry per line, or ``'json'``
            for a JSON document.

    Returns:
        For ``'txt'``: a list of the distinct lines (trailing whitespace
        stripped) in order of first appearance. For ``'json'``: whatever
        ``json.load`` produces for the document. For any other *filetype*:
        an empty list.
    """
    if filetype == 'txt':
        # Explicit UTF-8 so the result does not depend on the platform locale.
        with open(filename + '.txt', encoding='utf-8') as file:
            # dict.fromkeys de-duplicates while keeping file order, so the
            # returned list is identical from run to run (a set is not).
            return list(dict.fromkeys(line.rstrip() for line in file))

    if filetype == 'json':
        with open(filename + '.json', encoding='utf-8') as json_file:
            return json.load(json_file)

    return []


# Resource tables, loaded once at import time from files named after each
# variable (see `read_text` for the formats).
contractions = read_text('contractions', 'json')  # used as a mapping: pattern -> expansion
similar_words = read_text('similar_words')
addon_words = read_text('addon_words')
profanities = read_text('profanities', 'json')  # mapping: base profanity -> variations (presumably lists; confirm against the data file)

# Flat array holding every variation (values) followed by every base profanity (keys).
lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])

# Candidate pool for fuzzy matching. Sorted so that ties between equally
# scored candidates are broken the same way on every run.
lookup_words = sorted(set(similar_words).union(lookup_profanity.tolist()))

# English vocabulary minus anything that is also a known profanity. Kept as a
# set because it is only used for membership tests and is very large.
eng_words = set(words.words()) - set(lookup_profanity)

# Matches leading/trailing runs of characters other than word characters, '#' and '@'.
punctuations = re.compile(r'^[^\w#@]+|[^\w#@]+$')

# TODO: check for English words that are also Tagalog profanities

def fuzzy_lookup(tweet):
    """Find tokens in *tweet* that fuzzily match a known profanity.

    Each whitespace-separated token is stripped of surrounding punctuation,
    lower-cased, and has runs of three or more identical characters collapsed
    to one. Tokens found in ``eng_words`` are skipped. Every entry of
    ``addon_words`` is then removed from the start and end of the token, a
    leading ``@`` or ``#`` is dropped, and tokens of at least four characters
    are compared against ``lookup_words`` with ``fuzz.ratio``. The first
    candidate with the highest score (70 or more) is kept, and if it is a
    profanity or a listed variation the token is recorded.

    Args:
        tweet: Raw input text.

    Returns:
        A dict mapping each matching token (lower-cased and punctuation-
        stripped, but before character collapsing and addon removal) to its
        base profanity, i.e. the matching key of ``profanities``.
    """
    matched_profanity = {}

    for token in tweet.split():
        token = punctuations.sub('', token).lower()
        base_word = token
        # Collapse elongations such as "gagooo" -> "gago".
        word = re.sub(r'(.)\1{2,}', r'\1', token)

        if word in eng_words:
            continue

        for addon in addon_words:
            if not addon:
                # An empty entry (e.g. a blank line in the word file) would
                # make word[:-0] evaluate to '' and wipe out the token.
                continue
            if word.startswith(addon):
                word = word[len(addon):]
            if word.endswith(addon):
                word = word[:-len(addon)]

        if word.startswith(("@", "#")):
            word = word[1:]

        if len(word) < 4:
            continue

        # Track the first candidate with the highest qualifying score.
        best_score = 0
        best_match = None
        for lookup_word in lookup_words:
            score = fuzz.ratio(word, lookup_word)
            if score >= 70 and score > best_score:
                best_score, best_match = score, lookup_word

        if best_match is None or best_match not in lookup_profanity:
            continue

        # Resolve the match to the base profanity it belongs to.
        for base_profanity, profanity_variations in profanities.items():
            if best_match == base_profanity or best_match in profanity_variations:
                matched_profanity[base_word] = base_profanity
                break

    return matched_profanity


def preprocess(tweet, profanities):
    """Normalise *tweet* before it is sent to the classifier.

    Steps, in order: lower-case; remove emoji; replace every detected
    profanity spelling with its base form; collapse runs of three or more
    identical characters; drop tokens containing ``http``; rewrite tokens
    containing a laugh pattern to ``haha``; expand contractions (skipped when
    only one token remains).

    Args:
        tweet: Raw input text.
        profanities: Mapping of a spelling found in the tweet to the base
            profanity that should replace it (the shape `fuzzy_lookup`
            returns).

    Returns:
        The normalised text as a single space-separated string.
    """
    # Substrings that mark a token as laughter.
    laugh_texts = ('hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha')

    tweet = tweet.lower()
    tweet = emoji.replace_emoji(tweet, replace='')

    # Replace profanities
    for base_word, matched_word in profanities.items():
        tweet = tweet.replace(base_word, matched_word)

    # Elongated words conversion
    tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)

    kept_words = []
    for word in tweet.split():
        # Remove links. Skipping here also keeps a URL that happens to
        # contain a laugh pattern from being turned into 'haha'.
        if 'http' in word:
            continue

        # Unify laugh texts format to 'haha'
        if any(pattern in word for pattern in laugh_texts):
            word = 'haha'

        kept_words.append(word)

    # Combine list of words back to sentence
    preprocessed_tweet = ' '.join(kept_words)

    if len(kept_words) == 1:
        return preprocessed_tweet

    # Expand contractions. Keys are inserted into the pattern unescaped, so
    # they are treated as regular expressions.
    for contraction, expansion in contractions.items():
        preprocessed_tweet = re.sub(rf"\b{contraction}\b", expansion, preprocessed_tweet)

    return preprocessed_tweet



def predict(tweet):
    """Classify *tweet* and report which of its words were flagged.

    The remote model is only queried when `fuzzy_lookup` finds at least one
    profanity in the text.

    Args:
        tweet: Raw input text.

    Returns:
        A ``(message, flagged)`` pair:
          * ``("No Profanity", {})`` when nothing was detected;
          * ``(error_message, {})`` when the service reply is a dict, which
            this function treats as an error report;
          * ``(label, [flagged words])`` otherwise.
    """
    detected = fuzzy_lookup(tweet)

    if not detected:
        return "No Profanity", {}

    preprocessed_tweet = preprocess(tweet, detected)
    prediction = query(preprocessed_tweet)

    if isinstance(prediction, dict):
        print(prediction)
        # Fall back to the whole reply if it carries no 'error' field.
        return prediction.get('error', str(prediction)), {}

    # Presumably a nested list with the top-ranked label first; confirm
    # against the inference API response format.
    label = prediction[0][0]["label"]
    flagged_words = list(detected.keys())

    print("\nTWEET:", tweet)
    print("DETECTED PROFANITY:", flagged_words)
    print("LABEL:", label, "\n")

    return label, flagged_words


# Web UI: a single text box in, the predicted label and the flagged words out.
demo = gr.Interface(
    fn=predict,

    inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],

    outputs=[gr.components.Text(label="PREDICTION"), gr.JSON(label="PROFANITIES")],

    # Sample inputs offered in the UI. The emoji were stored as mis-decoded
    # UTF-8 bytes; they are restored here so the samples contain real emoji.
    examples=['Tangina mo naman sobrang yabang mo gago!!😠😤 @davidrafael',
              'Napakainit ngayong araw pakshet namaaan!!',
              'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
              'Bobo ka ba? napakadali lang nyan eh... 🤡',
              'Uy gago laptrip yung nangyare samen kanina HAHAHA😂😂'],

    allow_flagging="never",

    title="Tagalog Profanity Classifier"
)

# Start the app when this file is executed.
demo.launch(debug=True)