File size: 2,818 Bytes
829eb6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import gradio as gr
from transformers import pipeline
import re

# Shared sentiment classifier, built once when the module is imported and
# reused for every moderation request (DistilBERT fine-tuned on SST-2).
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# Default guidelines shown in the admin textbox. The moderation code looks for
# these phrases (case-insensitively) to decide which rules are switched on.
# The leading and trailing empty entries keep the surrounding newlines.
moderation_guidelines = "\n".join(
    (
        "",
        "- Allow positive messages",
        "- Block cuss words",
        "- Allow negative comments about individuals but block negative comments against a community",
        "- Block words: Darren",
        "",
    )
)


# Built-in profanity list, compared against the lowercased word tokens of a
# message when the guidelines enable cuss-word blocking.
default_cuss_words = {
    "ass",
    "bastard",
    "bitch",
    "bollocks",
    "bugger",
    "bullshit",
    "crap",
    "dammit",
    "damn",
    "douche",
    "dumbass",
    "faggot",
    "fuck",
    "hell",
    "jackass",
    "jerk",
    "motherfucker",
    "piss",
    "prick",
    "shit",
    "slut",
    "son of a bitch",
    "twat",
    "wanker",
}

# Keywords treated as a reference to a community rather than an individual;
# used to decide whether a negative message targets a group.
community_terms = {
    "china",
    "community",
    "ethnicity",
    "gender",
    "group",
    "race",
    "religion",
}

def extract_blocked_words(guidelines):
    """Extract the admin-defined blocked words from the guidelines text.

    Finds the first ``block words:`` marker (case-insensitive) and reads the
    rest of that line as a comma-separated list.

    Args:
        guidelines: Free-form moderation guidelines text.

    Returns:
        A set of lowercased entries with surrounding whitespace removed.
        The set is empty when there is no ``block words:`` marker or when
        nothing is listed after it.
    """
    # Only spaces/tabs are skipped after the colon so that an empty list does
    # not swallow the following guideline line as if it were a blocked word.
    match = re.search(r"block words:[ \t]*(.*)", guidelines.lower())
    if match is None:
        return set()

    # Strip every entry: "a, b" must yield "b" rather than " b", otherwise the
    # entry can never equal a word token from a message. Empty entries
    # (trailing commas, empty list) are dropped.
    return {entry.strip() for entry in match.group(1).split(",")} - {""}

def _contains_term(text, tokens, terms):
    """Return True if any of *terms* occurs in the message as a whole word or phrase."""
    if tokens & terms:
        return True
    # Entries that are not a single \w+ token (e.g. "son of a bitch") can
    # never appear in `tokens`; search the lowercased text for them instead,
    # guarded so they only match on word boundaries.
    return any(
        re.search(rf"(?<!\w){re.escape(term)}(?!\w)", text)
        for term in terms
        if re.search(r"\W", term)
    )


def _mentions_community(tokens):
    """Return True if a token is a community term or its simple "-s" plural."""
    return any(
        token in community_terms
        or (token.endswith("s") and token[:-1] in community_terms)
        for token in tokens
    )


def moderate_message(message, guidelines):
    """Moderate a message using word filters, sentiment and the guidelines.

    The checks run in this order and the first one that applies decides the
    result:

    1. Cuss words, when the guidelines contain "block cuss".
    2. Words listed after "block words:" in the guidelines.
    3. Positive sentiment, when the guidelines contain "allow positive".
    4. Negative sentiment: blocked if the message mentions a community term
       and the community rule is enabled; otherwise allowed as a personal
       remark when that rule is enabled.
    5. Everything else is allowed and labelled neutral.

    Args:
        message: The user message to check.
        guidelines: Free-form guidelines text; rules are detected by phrase,
            case-insensitively.

    Returns:
        A human-readable verdict string ("Message Blocked: ..." or
        "Allowed (...): <message>").
    """
    rules = guidelines.lower()
    allow_positive = "allow positive" in rules
    block_cuss_words = "block cuss" in rules
    allow_negative_personal = "allow negative comments about individuals" in rules
    block_negative_community = "block negative comments against a community" in rules

    text = message.lower()
    words = set(re.findall(r'\w+', text))

    if block_cuss_words and _contains_term(text, words, default_cuss_words):
        return "❌ Message Blocked: Contains inappropriate language."

    # Normalise the entries so that "a, b" style lists match message tokens
    # and empty entries are ignored.
    blocked_words = {entry.strip() for entry in extract_blocked_words(guidelines)} - {""}
    if _contains_term(text, words, blocked_words):
        return "🚫 Message Blocked: Contains restricted words."

    # The model is only consulted once the cheap word filters have passed.
    sentiment = sentiment_pipeline(message)[0]['label']

    if sentiment == "POSITIVE" and allow_positive:
        return f"✅ Allowed (Positive): {message}"

    if sentiment == "NEGATIVE":
        # Whole-word matching: a substring test would treat "embrace" or
        # "bracelet" as a mention of "race".
        if block_negative_community and _mentions_community(words):
            return "🚫 Message Blocked: Negative content targeting a community."
        if allow_negative_personal:
            return f"⚠️ Allowed (Negative - Personal Attack): {message}"

    # Reached when no enabled rule matched the message's sentiment.
    return f"✅ Allowed (Neutral): {message}"

# Gradio UI: an editable guidelines box, a message box, and a read-only result
# box that is filled by moderate_message when the button is clicked.
with gr.Blocks() as demo:
    gr.Markdown("### 🛡️ AI-Powered Moderation System")

    guidelines_input = gr.Textbox(value=moderation_guidelines, label="Moderation Guidelines (Admins Can Update)", lines=4)

    with gr.Row():
        msg_input = gr.Textbox(label="Enter Message")
        msg_output = gr.Textbox(label="Moderation Result", interactive=False)

    moderate_btn = gr.Button("Check Message")
    moderate_btn.click(moderate_message, inputs=[msg_input, guidelines_input], outputs=[msg_output])

# Only start the web server when run as a script, so importing this module
# (e.g. to reuse `demo` or `moderate_message`) does not launch the app.
if __name__ == "__main__":
    demo.launch()