import os
import re

import emoji
import gradio as gr
import requests

API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"
# Read the API token from the environment (set HF_API_TOKEN before launching)
# rather than hardcoding a secret in the source.
headers = {"Authorization": f"Bearer {os.environ['HF_API_TOKEN']}"}

# Terms that predict() masks with '****' when the model labels the input Abusive.
profanities = ['bobo', 'bobong', 'bwiset', 'bwisit', 'buwisit', 'buwiset', 'bwesit', 'gago', 'gagong', 'kupal',
               'pakshet', 'pakyu', 'pucha', 'puchang',
               'punyeta', 'punyetang', 'puta', 'putang', 'putangina', 'putanginang', 'tanga', 'tangang', 'tangina',
               'tanginang', 'tarantado', 'tarantadong', 'ulol']

# Tagalog shortcuts, phonetic spellings, and spelling variants, expanded or
# normalized via word-boundary regexes at the end of preprocess().
contractions = {
    'di': 'hindi',
    'to': 'ito',
    'no': 'ano',
    'kundi': 'kung hindi',
    'nya': 'niya',
    'nyo': 'ninyo',
    'niyo': 'ninyo',
    'pano': 'paano',
    'sainyo': 'sa inyo',
    'sayo': 'sa iyo',
    'pag': 'kapag',
    'kesa': 'kaysa',
    'dun': 'doon',
    'ganto': 'ganito',
    'nandun': 'nandoon',
    'saka': 'tsaka',
    'ung': 'yung',
    'wag': 'huwag',
    'sya': 'siya',
    'bat': 'bakit',
    'yon': 'iyon',
    'yun': 'iyon',
    'dyan': 'diyan',
    'jan': 'diyan',
    'andito': 'nandito',
    'tanginamo': 'tangina mo',
    'putanginamo': 'putangina mo',
    'san': 'saan',
    'ganun': 'ganoon',
    'gagong': 'gago na',
    'bobong': 'bobo na',
    'tangang': 'tanga na',
    'kelan': 'kailan',
    'raw': 'daw',
    'tanginang': 'tangina na',
    'tarantadong': 'tarantado na',
    'putang ina': 'putangina',
    'putang inang': 'putangina',
    'putanginang': 'putangina',
    'itong': 'ito ang',
    'lng': 'lang',
    'bwisit': 'bwiset',
    'bwesit': 'bwiset',
    'buwisit': 'bwiset',
    'buwesit': 'bwiset'
}


def preprocess(row):
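    """Clean a raw post: lowercase, strip emojis/mentions/hashtags/links and
    words with digits, collapse repeated letters, normalize laugh variants,
    and expand contractions."""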
    laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
    symbols = ['@', '#']

    # Lowercase
    row = row.lower()

    # Remove emojis
    row = emoji.replace_emoji(row, replace='')

    # Collapse characters repeated three or more times ('grabeee' -> 'grabe');
    # doubled letters are left untouched
    row = re.sub(r'(.)\1{2,}', r'\1', row)

    # Split sentence into list of words
    row_split = row.split()

    for index, word in enumerate(row_split):

        # Remove words with symbols (e.g. @username, #hashtags)
        if any(x in word for x in symbols):
            row_split[index] = ''

        # Remove links
        if 'http' in word:
            row_split[index] = ''

        # Normalize laugh variants ('hahaha', 'wahaha', ...) to 'haha'
        if any(x in word for x in laugh_texts):
            row_split[index] = 'haha'

        # Remove words with digits (4ever)
        if any(x.isdigit() for x in word):
            row_split[index] = ''

    # Combine list of words back to sentence
    combined_text = ' '.join(filter(None, row_split))

    # If only a single word remains, return it as-is
    # (punctuation stripping and contraction expansion are skipped)
    if len(combined_text.split()) == 1:
        return combined_text

    # Keep only letters and spaces
    combined_text = re.sub(r"[^A-Za-z ]+", '', combined_text)

    # Expand contractions ('di' -> 'hindi', 'ganun' -> 'ganoon', ...)
    for short, full in contractions.items():
        combined_text = re.sub(rf"\b{re.escape(short)}\b", full, combined_text)

    return combined_text
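
# Illustrative trace of preprocess() on an assumed input (not from the file):
#   preprocess("Grabeee!! di mo ba alam yan?? 😂 @user http://t.co 4ever")
#   lowercase -> emoji removed -> 'grabeee' collapsed to 'grabe'
#   -> '@user' (symbol), the link ('http'), and '4ever' (digit) dropped
#   -> punctuation stripped, then 'di' expanded to 'hindi'
#   -> "grabe hindi mo ba alam yan"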


def query(payload):
    # The Inference API expects JSON of the form {"inputs": "<text>"}
    response = requests.post(API_URL, headers=headers, json={"inputs": payload})
    return response.json()
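
# While the hosted model is cold, the Inference API answers with a dict such
# as {"error": "...", "estimated_time": ...} instead of predictions. A minimal
# retry sketch (helper name, retry count, and wait time are assumptions, not
# part of the original app):
import time

def query_with_retry(payload, retries=3, wait=5.0):
    result = query(payload)
    for _ in range(retries):
        # A list means predictions came back; a dict with 'error' means loading
        if not (isinstance(result, dict) and 'error' in result):
            break
        time.sleep(wait)
        result = query(payload)
    return result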


def predict(text):
    cleaned = preprocess(text)
    print(cleaned)
    output = query(cleaned)[0]
    print(output)
    # Convert the API's [{'label': ..., 'score': ...}, ...] list into a dict
    # keyed by label, instead of relying on the dicts' key order
    output = {item['label']: item['score'] for item in output}

    # Scores arrive sorted in descending order, so the first key is the top label
    predicted_label = next(iter(output))

    if predicted_label == 'Abusive':
        # Censor each profanity in the original text, case-insensitively
        output_text = text
        for word in profanities:
            compiled = re.compile(re.escape(word), re.IGNORECASE)
            output_text = compiled.sub('****', output_text)
        return output, output_text
    else:
        return output, text
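
# Note: the masking loop matches raw substrings, so 'bobo' also fires inside
# longer words and turns 'bobong' into '****ng' before the 'bobong' pattern is
# reached. A hedged whole-word alternative (single pass, longest entries first)
# would be:
#
#     pattern = re.compile(
#         r'\b(' + '|'.join(map(re.escape, sorted(profanities, key=len, reverse=True))) + r')\b',
#         re.IGNORECASE)
#     output_text = pattern.sub('****', text)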


# Write token for the feedback dataset, read from the environment (set HF_WRITE_TOKEN)
hf_writer = gr.HuggingFaceDatasetSaver(os.environ['HF_WRITE_TOKEN'], 'tagalog-profanity-feedbacks')


demo = gr.Interface(
    fn=predict,

    inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],

    outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
             gr.components.Text(label='OUTPUT')],

    examples=['Tangina mo naman sobrang yabang mo gago!!😠😤 @davidrafael',
              'Napakainit ngayong araw pakshet namaaan!!',
              'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
              'Bobo ka ba? napakadali lang nyan eh... 🤡',
              'Uy gago laptrip yung nangyare samen kanina HAHAHA😂😂'],

    allow_flagging="manual",
    flagging_callback=hf_writer,
    flagging_options=['Good bot', 'Bad bot']
)

# debug=True keeps the call blocking and surfaces errors in the console
demo.launch(debug=True)