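"""Streamlit demo: detecting toxic comments with Spark NLP.

Runs a pretrained BertForSequenceClassification model over a sample or
user-supplied text and reports which toxicity label it predicts.
"""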
import streamlit as st
import sparknlp

from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

# Page configuration
st.set_page_config(
    layout="wide", 
    page_title="Spark NLP Demos App", 
    initial_sidebar_state="auto"
)

# CSS for styling
st.markdown("""

    <style>

        .main-title {

            font-size: 36px;

            color: #4A90E2;

            font-weight: bold;

            text-align: center;

        }

        .section p, .section ul {

            color: #666666;

        }

    </style>

""", unsafe_allow_html=True)

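# Cache the Spark session and pipeline across Streamlit reruns so they are
# built only once per process.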
@st.cache_resource
def init_spark():
    return sparknlp.start()

@st.cache_resource
def create_pipeline(model):
    # Turn the raw input string into a Spark NLP document
    documentAssembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    # Split each document into tokens for the transformer
    tokenizer = Tokenizer() \
        .setInputCols(["document"]) \
        .setOutputCol("token")

    # Load the pretrained classifier chosen in the sidebar
    sequenceClassifier = BertForSequenceClassification.pretrained(model, "en") \
        .setInputCols(["document", "token"]) \
        .setOutputCol("class")

    return Pipeline(stages=[documentAssembler, tokenizer, sequenceClassifier])

def fit_data(pipeline, data):
    # Fit on an empty DataFrame just to materialize the pipeline stages
    empty_df = spark.createDataFrame([['']]).toDF('text')
    pipeline_model = pipeline.fit(empty_df)
    # LightPipeline annotates plain Python strings without a full Spark job
    model = LightPipeline(pipeline_model)
    results = model.fullAnnotate(data)[0]
    return results['class'][0].result

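# For reference, fullAnnotate returns one dict per input string, keyed by
# output column. A minimal sketch of inspecting a result (the metadata
# contents below are an assumption about this model's output, not verified):
#
#   annotation = results['class'][0]
#   annotation.result    # predicted label, e.g. 'toxic'
#   annotation.metadata  # typically per-label confidence scores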
# Set up the page layout
st.markdown('<div class="main-title">State-of-the-Art Detecting Toxic Comments with Spark NLP</div>', unsafe_allow_html=True)

# Sidebar content
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    ["bert_classifier_toxic"],
    help="For more info about the models visit: https://sparknlp.org/models"
)
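
# Additional sequence-classification models from https://sparknlp.org/models
# could be added to the list above; create_pipeline loads whichever is selected.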

# Reference notebook link in sidebar
link = """

<a href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/BertForSequenceClassification.ipynb">

    <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>

</a>

"""
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)

# Load examples
examples = [
  "Streatham again Thank you for the copyedits I have been considering withdrawing the nomination owing to the nonconstructive arguing with a certain reviewer but your edits make me think I cant let everyone down Thank you",
  "Do you know how pathetic you look with your silly little crusade of removing England or English with articles? Grow up and find something better to do with your life.",
  "it is obvius to, me, a black man, that orange is for da black Gs, blue is for mexicans and yellow is 4 whites. coz whites look like woman. and mexicans try to compete with da nigger Gs but we r too strong. but in realty the whites also lose to da black man.",
  "She should stop sticking her tongue out before someone rubs their ass on it. Filthy bitch!!!",
  "personal attacks dont tell me what i can and can not do i'll come over there right now and throw a brick through ur window",
  "First i vandalized his page, then i vandalized again, and then his IP adress was blocked hahahahahahahahahahahahahahahahahahaha. This plan is so evil and yet so much fun.",
  "Silly guy. You should use email like the wikijews",
  """Quick question about the section re-ordering... Why was Antagonists switched with Supporting, and why were all ""bad"" characters removed from the Supporting section?  (converse) " """,
  "I will respond until Armenian scum will stop",
  "Isn't tart a kind of whore or concubine or courtesane Eg she's a politician's tart",
  "....thats why I kill you and your father and your mother and you brother",
  "You're ridiculous It's obvious that you have a problem with people with money and education.  Leave the Horace Mann page alone.  You know nothing of the school whatsoever.  HM doesn't admit dirt."
    
]

st.subheader("Classify comments and tweets into Toxic, Insults, Hate, Obscene, Threat.")

selected_text = st.selectbox("Select a sample", examples)
custom_input = st.text_input("Try it for yourself!")

# A custom input, if provided, overrides the selected sample
if custom_input:
    selected_text = custom_input

st.subheader('Selected Text')
st.write(selected_text)

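# Note: the first call to pretrained() downloads the model from the Spark NLP
# model hub, so the initial run can take a while.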
# Initialize Spark and create pipeline
spark = init_spark()
pipeline = create_pipeline(model)
output = fit_data(pipeline, selected_text)

# Display output sentence
# Map each predicted label to an article, display phrase, highlight color, and emoji
label_display = {
    'severe_toxic':  ('a',  'severely toxic', '#209DDC', '&#129324;'),
    'toxic':         ('a',  'toxic',          '#B64434', '&#129324;'),
    'insult':        ('an', 'insulting',      '#B64434', '&#128560;'),
    'identity_hate': ('an', 'identity-hate',  '#B64434', '&#128560;'),
    'obscene':       ('an', 'obscene',        '#B64434', '&#129324;'),
    'threat':        ('a',  'threatening',    '#B64434', '&#129324;'),
}

if output in label_display:
    article, phrase, color, emoji = label_display[output]
    st.markdown(
        """<h3>This seems like {} <span style="color: {}">{}</span> tweet. """
        """<span style="font-size:35px;">{}</span></h3>""".format(article, color, phrase, emoji),
        unsafe_allow_html=True,
    )