abdullahmubeen10 committed on
Commit
2facf4c
·
verified ·
1 Parent(s): 34a3085

Upload 26 files

Browse files
.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [theme]
2
+ base="light"
3
+ primaryColor="#29B4E8"
Demo.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import sparknlp
3
+ import os
4
+ import pandas as pd
5
+
6
+ from sparknlp.base import *
7
+ from sparknlp.annotator import *
8
+ from pyspark.ml import Pipeline
9
+ from sparknlp.pretrained import PretrainedPipeline
10
+
11
+ # Page configuration
12
+ st.set_page_config(
13
+ layout="wide",
14
+ page_title="Spark NLP Demos App",
15
+ initial_sidebar_state="auto"
16
+ )
17
+
18
+ # CSS for styling
19
+ st.markdown("""
20
+ <style>
21
+ .main-title {
22
+ font-size: 36px;
23
+ color: #4A90E2;
24
+ font-weight: bold;
25
+ text-align: center;
26
+ }
27
+ .section p, .section ul {
28
+ color: #666666;
29
+ }
30
+ </style>
31
+ """, unsafe_allow_html=True)
32
+
33
@st.cache_resource
def init_spark():
    """Start Spark NLP and return the resulting Spark session.

    Wrapped in ``st.cache_resource`` so the value returned by
    ``sparknlp.start()`` is handed to Streamlit's resource cache.
    """
    session = sparknlp.start()
    return session
36
+
37
@st.cache_resource
def create_pipeline(model):
    """Assemble the unfitted toxic-comment classification pipeline.

    Args:
        model: Name of the pretrained English ``BertForSequenceClassification``
            model to load, e.g. ``"bert_classifier_toxic"``.

    Returns:
        A ``pyspark.ml.Pipeline`` whose stages read the ``text`` column and
        produce ``document``, ``token`` and finally ``class`` columns.
    """
    document_assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )

    tokenizer = (
        Tokenizer()
        .setInputCols("document")
        .setOutputCol("token")
    )

    # Load the model the caller selected. The name used to be hard-coded
    # here, which silently ignored the `model` argument coming from the
    # sidebar selectbox.
    sequence_classifier = (
        BertForSequenceClassification.pretrained(model, "en")
        .setInputCols(["document", "token"])
        .setOutputCol("class")
    )

    return Pipeline(stages=[document_assembler, tokenizer, sequence_classifier])
54
+
55
def fit_data(pipeline, data):
    """Classify one piece of text and return the label of its first prediction.

    Args:
        pipeline: Unfitted pipeline whose output contains a ``class`` column.
        data: Text handed to ``LightPipeline.fullAnnotate``.

    Returns:
        The ``result`` attribute of the first ``class`` annotation produced
        for ``data``.
    """
    # The pipeline is fitted on a one-row DataFrame holding an empty string
    # in a `text` column. Relies on the module-level `spark` session.
    placeholder_df = spark.createDataFrame([['']]).toDF('text')
    fitted_pipeline = pipeline.fit(placeholder_df)
    light_pipeline = LightPipeline(fitted_pipeline)

    annotations = light_pipeline.fullAnnotate(data)
    first_annotation = annotations[0]
    return first_annotation['class'][0].result
61
+
62
+ # Set up the page layout
63
+ st.markdown('<div class="main-title">State-of-the-Art Detecting Toxic Comments with Spark NLP</div>', unsafe_allow_html=True)
64
+
65
+ # Sidebar content
66
+ model = st.sidebar.selectbox(
67
+ "Choose the pretrained model",
68
+ ["bert_classifier_toxic"],
69
+ help="For more info about the models visit: https://sparknlp.org/models"
70
+ )
71
+
72
+ # Reference notebook link in sidebar
73
+ link = """
74
+ <a href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/BertForSequenceClassification.ipynb">
75
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
76
+ </a>
77
+ """
78
+ st.sidebar.markdown('Reference notebook:')
79
+ st.sidebar.markdown(link, unsafe_allow_html=True)
80
+
81
+ # Load examples
82
+ examples = [
83
+ "Streatham again Thank you for the copyedits I have been considering withdrawing the nomination owing to the nonconstructive arguing with a certain reviewer but your edits make me think I cant let everyone down Thank you",
84
+ "Do you know how pathetic you look with your silly little crusade of removing England or English with articles? Grow up and find something better to do with your life.",
85
+ "it is obvius to, me, a black man, that orange is for da black Gs, blue is for mexicans and yellow is 4 whites. coz whites look like woman. and mexicans try to compete with da nigger Gs but we r too strong. but in realty the whites also lose to da black man.",
86
+ "She should stop sticking her tongue out before someone rubs their ass on it. Filthy bitch!!!",
87
+ "personal attacks dont tell me what i can and can not do i'll come over there right now and throw a brick through ur window",
88
+ "First i vandalized his page, then i vandalized again, and then his IP adress was blocked hahahahahahahahahahahahahahahahahahaha. This plan is so evil and yet so much fun.",
89
+ "Silly guy. You should use email like the wikijews",
90
+ """Quick question about the section re-ordering... Why was Antagonists switched with Supporting, and why were all ""bad"" characters removed from the Supporting section? (converse) " """,
91
+ "I will respond until Armenian scum will stop",
92
+ "Isn't tart a kind of whore or concubine or courtesane Eg she's a politician's tart",
93
+ "....thats why I kill you and your father and your mother and you brother",
94
+ "You're ridiculous It's obvious that you have a problem with people with money and education. Leave the Horace Mann page alone. You know nothing of the school whatsoever. HM doesn't admit dirt."
95
+
96
+ ]
97
+
98
+ st.subheader("Classify comments and tweets into Toxic, Insults, Hate, Obscene, Threat.")
99
+
100
# Offer the bundled samples alongside a free-text box; any text typed by the
# user takes precedence over the sample picked in the dropdown.
sample_choice = st.selectbox("Select a sample", examples)
custom_input = st.text_input("Try it for yourself!")

selected_text = custom_input if custom_input else sample_choice
107
+
108
+ st.subheader('Selected Text')
109
+ st.write(selected_text)
110
+
111
+ # Initialize Spark and create pipeline
112
+ spark = init_spark()
113
+ pipeline = create_pipeline(model)
114
+ output = fit_data(pipeline, selected_text)
115
+
116
+ # Display output sentence
117
# Render a headline for the predicted label. Each branch keeps its own
# article ("a"/"an"), highlight colour and emoji; labels not listed here
# render nothing.
if output == 'severe_toxic':
    # Fixed: this branch used to format the undefined name `out`, which
    # raised NameError whenever the model predicted `severe_toxic`.
    st.markdown("""<h3>This seems like a <span style="color: #209DDC">{}</span> tweet. <span style="font-size:35px;">&#129324;</span></h3>""".format(output), unsafe_allow_html=True)
elif output == 'toxic':
    st.markdown("""<h3>This seems like a <span style="color: #B64434">{}</span> tweet. <span style="font-size:35px;">&#129324;</span></h3>""".format(output), unsafe_allow_html=True)
elif output == 'insult':
    # Shown as the adjective "insulting" rather than the raw label.
    st.markdown("""<h3>This seems like an <span style="color: #B64434">{}</span> tweet. <span style="font-size:35px;">&#128560;</span></h3>""".format('insulting'), unsafe_allow_html=True)
elif output == 'identity_hate':
    st.markdown("""<h3>This seems like a <span style="color: #B64434">{}</span> tweet. <span style="font-size:35px;">&#128560;</span></h3>""".format(output), unsafe_allow_html=True)
elif output == 'obscene':
    st.markdown("""<h3>This seems like an <span style="color: #B64434">{}</span> tweet. <span style="font-size:35px;">&#129324;</span></h3>""".format(output), unsafe_allow_html=True)
elif output == 'threat':
    # Shown as the adjective "threatening" rather than the raw label.
    st.markdown("""<h3>This seems like a <span style="color: #B64434">{}</span> tweet. <span style="font-size:35px;">&#129324;</span></h3>""".format('threatening'), unsafe_allow_html=True)
Dockerfile ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Download base image ubuntu 18.04
2
+ FROM ubuntu:18.04
3
+
4
+ # Set environment variables
5
+ ENV NB_USER jovyan
6
+ ENV NB_UID 1000
7
+ ENV HOME /home/${NB_USER}
8
+
9
+ # Install required packages
10
+ RUN apt-get update && apt-get install -y \
11
+ tar \
12
+ wget \
13
+ bash \
14
+ rsync \
15
+ gcc \
16
+ libfreetype6-dev \
17
+ libhdf5-serial-dev \
18
+ libpng-dev \
19
+ libzmq3-dev \
20
+ python3 \
21
+ python3-dev \
22
+ python3-pip \
23
+ unzip \
24
+ pkg-config \
25
+ software-properties-common \
26
+ graphviz \
27
+ openjdk-8-jdk \
28
+ ant \
29
+ ca-certificates-java \
30
+ && apt-get clean \
31
+ && update-ca-certificates -f;
32
+
33
+ # Install Python 3.8 and pip
34
+ RUN add-apt-repository ppa:deadsnakes/ppa \
35
+ && apt-get update \
36
+ && apt-get install -y python3.8 python3-pip \
37
+ && apt-get clean;
38
+
39
+ # Set up JAVA_HOME
40
+ ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
41
+ RUN mkdir -p ${HOME} \
42
+ && echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> ${HOME}/.bashrc \
43
+ && chown -R ${NB_UID}:${NB_UID} ${HOME}
44
+
45
+ # Create a new user named "jovyan" with user ID 1000
46
+ RUN useradd -m -u ${NB_UID} ${NB_USER}
47
+
48
+ # Switch to the "jovyan" user
49
+ USER ${NB_USER}
50
+
51
+ # Set home and path variables for the user
52
+ ENV HOME=/home/${NB_USER} \
53
+ PATH=/home/${NB_USER}/.local/bin:$PATH
54
+
55
+ # Set the working directory to the user's home directory
56
+ WORKDIR ${HOME}
57
+
58
+ # Upgrade pip and install Python dependencies
59
+ RUN python3.8 -m pip install --upgrade pip
60
+ COPY requirements.txt /tmp/requirements.txt
61
+ RUN python3.8 -m pip install -r /tmp/requirements.txt
62
+
63
+ # Copy the application code into the container at /home/jovyan
64
+ COPY --chown=${NB_USER}:${NB_USER} . ${HOME}
65
+
66
+ # Expose port for Streamlit
67
+ EXPOSE 7860
68
+
69
+ # Define the entry point for the container
70
+ ENTRYPOINT ["streamlit", "run", "Demo.py", "--server.port=7860", "--server.address=0.0.0.0"]
images/johnsnowlabs-toxic-output.png ADDED
inputs/sentimentdl_use_imdb/Example1.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Demonicus is a movie turned into a video game! I just love the story and the things that goes on in ...
2
+ Demonicus is a movie turned into a video game! I just love the story and the things that goes on in the film.It is a B-film ofcourse but that doesn`t bother one bit because its made just right and the music was rad! Horror and sword fight freaks,buy this movie now!
inputs/sentimentdl_use_imdb/Example10.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ This is to the Zatoichi movies as the "Star Trek" movies were to "Star Trek"--except that in this ca...
2
+ This is to the Zatoichi movies as the "Star Trek" movies were to "Star Trek"--except that in this case every one of the originals was more entertaining and interesting than this big, shiny re-do, and also better made, if substance is more important than surface. Had I never seen them, I would have thought this good-looking but empty; since I had, I thought its style inappropriate and its content insufficient. The idea of reviving the character in a bigger, slicker production must have sounded good, but there was no point in it, other than the hope of making money; it's just a show, which mostly fails to capture the atmosphere of the character's world and wholly fails to take the character anywhere he hasn't been already (also, the actor wasn't at his best). I'd been hoping to see Ichi at a late stage of life, in a story that would see him out gracefully and draw some conclusion from his experience overall; this just rehashes bits and pieces from the other movies, seasoned with more sex and sfx violence. Not the same experience at all.
inputs/sentimentdl_use_imdb/Example2.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Back when Alec Baldwin and Kim Basinger were a mercurial, hot-tempered, high-powered Hollywood coupl...
2
+ Back when Alec Baldwin and Kim Basinger were a mercurial, hot-tempered, high-powered Hollywood couple they filmed this (nearly) scene-for-scene remake of the 1972 Steve McQueen-Ali MacGraw action-thriller about a fugitive twosome. It almost worked the first time because McQueen was such a vital presence on the screen--even stone silent and weary, you could sense his clock ticking, his cagey magnetism. Baldwin is not in Steve McQueen's league, but he has his charms and is probably a more versatile actor--if so, this is not a showcase for his attributes. Basinger does well and certainly looks good, but James Woods is artificially hammy in a silly mob-magnet role. A sub-plot involving another couple taken hostage by Baldwin's ex-partner was unbearable in the '72 film and plays even worse here. As for the action scenes, they're pretty old hat, which causes one to wonder: why even remake the original?
inputs/sentimentdl_use_imdb/Example3.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Despite a tight narrative, Johnnie To's Election feels at times like it was once a longer picture, w...
2
+ Despite a tight narrative, Johnnie To's Election feels at times like it was once a longer picture, with many characters and plot strands abandoned or ultimately unresolved. Some of these are dealt with in the truly excellent and far superior sequel, Election 2: Harmony is a Virtue, but it's still a dependably enthralling thriller about a contested Triad election that bypasses the usual shootouts and explosions (though not the violence) in favour of constantly shifting alliances that can turn in the time it takes to make a phone call. It's also a film where the most ruthless character isn't always the most threatening one, as the chilling ending makes only too clear: one can imagine a lifetime of psychological counselling being necessary for all the trauma that one inflicts on one unfortunate bystander. Simon Yam, all too often a variable actor but always at his best under To's direction, has possibly never been better in the lead, not least because Tony Leung's much more extrovert performance makes his stillness more the powerful.
inputs/sentimentdl_use_imdb/Example4.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ This movie has successfully proved what we all already know, that professional basket-ball players s...
2
+ This movie has successfully proved what we all already know, that professional basket-ball players suck at everything besides playing basket-ball. Especially rapping and acting. I can not even begin to describe how bad this movie truly is. First of all, is it just me, or is that the ugliest kid you have ever seen? I mean, his teeth could be used as a can-opener. Secondly, why would a genie want to pursue a career in the music industry when, even though he has magical powers, he sucks horribly at making music? Third, I have read the Bible. In no way shape or form did it say that Jesus made genies. Fourth, what was the deal with all the crappy special effects? I assure you that any acne-addled nerdy teenager with a computer could make better effects than that. Fifth, why did the ending suck so badly? And what the hell is a djin? And finally, whoever created the nightmare known as Kazaam needs to be thrown off of a plane and onto the Eiffel Tower, because this movie take the word "suck" to an entirely new level.
inputs/sentimentdl_use_imdb/Example5.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ The fluttering of butterfly wings in the Atlantic can unleash a hurricane in the Pacific. According ...
2
+ The fluttering of butterfly wings in the Atlantic can unleash a hurricane in the Pacific. According to this theory (somehow related to the Chaos Theory, I'm not sure exactly how), every action, no matter how small or insignificant, will start a chain reaction that can lead to big events. This small jewel of a film shows us a series of seemingly-unrelated characters, most of them in Paris, whose actions will affect each others' lives. (The six-degrees-of-separation theory can be applied as well.) Each story is a facet of the jewel that is this film. The acting is finely-tuned and nuanced (Audrey Tautou is luminous), the stories mesh plausibly, the humor is just right, and the viewer leaves the theatre nodding in agreement.
inputs/sentimentdl_use_imdb/Example6.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ There have been very few films I have not been able to sit through. I made it through Battle Field E...
2
+ There have been very few films I have not been able to sit through. I made it through Battle Field Earth no problem. But this, This is one of the single worst films EVER to be made. I understand Whoopi Goldberg tried to get of acting in it. I do not blame her. I would feel ashamed to have this on a resume. I belive it is a rare occasion when almost every gag in a film falls flat on it's face. Well it happens here. Not to mention the SFX, look for the dino with the control cables hanging out of it rear end!!!!!! Halfway through the film I was still looking for a plot. I never found one. Save yourself the trouble of renting this and save 90 minutes of your life.
inputs/sentimentdl_use_imdb/Example7.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ After a long hard week behind the desk making all those dam serious decisions this movie is a great ...
2
+ After a long hard week behind the desk making all those dam serious decisions this movie is a great way to relax. Like Wells and the original radio broadcast this movie will take you away to a land of alien humor and sci-fi paraday. 'Captain Zippo died in the great charge of the Buick. He was a brave man.' The Jack Nicholson impressions shine right through that alien face with the dark sun glasses and leather jacket. And always remember to beware of the 'doughnut of death!' Keep in mind the number one rule of this movie - suspension of disbelief - sit back and relax - and 'Prepare to die Earth Scum!' You just have to see it for yourself.
inputs/sentimentdl_use_imdb/Example8.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ When Ritchie first burst on to movie scene his films were hailed as funny, witty, well directed and ...
2
+ When Ritchie first burst on to movie scene his films were hailed as funny, witty, well directed and original. If one could compare the hype he had generated with his first two attempts and the almost universal loathing his last two outings have created one should consider - has Ritchie been found out? Is he really that talented? Does he really have any genuine original ideas? Or is he simply a pretentious and egotistical director who really wants to be Fincher, Tarantino and Leone all rolled into one colossal and disorganised heap? After watching Revolver one could be excused for thinking were did it all go wrong? What happened to his great sense of humour? Where did he get all these mixed and convoluted ideas from? Revolver tries to be clever, philosophical and succinct, it tries to be an intelligent psychoanalysis, it tries to be an intricate and complicated thriller. Ritchie does make a gargantuan effort to fulfil all these many objectives and invests great chunks of a script into existential musings and numerous plot twists. However, in the end all it serves is to construct a severely disjointed, unstructured and ultimately unfriendly film to the audience. Its plagiarism is so sinful and blatant that although Ritchie does at least attempt to give his own spin he should be punished for even trying to pass it off as his own work. So what the audience gets ultimately is a terrible screenplay intertwined with many pretentious oneliners and clumsy setpieces.<br /><br />Revolver is ultimately an unoriginal and bland movie that has stolen countless themes from masterpieces like Fight Club, Usual Suspects and Pulp Fiction. It aims high, but inevitably shots blanks aplenty.<br /><br />Revolver deserves to be lambasted, it is a truly poor film masquerading as a wannabe masterpiece from a wannabe auteur. However, it falls flat on its farcical face and just fails at everything it wants to be and achieve.
inputs/sentimentdl_use_imdb/Example9.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ I always thought this would be a long and boring Talking-Heads flick full of static interior takes, ...
2
+ I always thought this would be a long and boring Talking-Heads flick full of static interior takes, dude, I was wrong. "Election" is a highly fascinating and thoroughly captivating thriller-drama, taking a deep and realistic view behind the origins of Triads-Rituals. Characters are constantly on the move, and although as a viewer you kinda always remain an outsider, it's still possible to feel the suspense coming from certain decisions and ambitions of the characters. Furthermore Johnnie To succeeds in creating some truly opulent images due to meticulously composed lighting and atmospheric light-shadow contrasts. Although there's hardly any action, the ending is still shocking in it's ruthless depicting of brutality. Cool movie that deserves more attention, and I came to like the minimalistic acoustic guitar score quite a bit.
inputs/sentimentdl_use_twitter/Example1.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ @Mbjthegreat i really dont want AT&amp;T phone service..they suck when it comes to having a signal...
2
+ @Mbjthegreat i really dont want AT&amp;T phone service..they suck when it comes to having a signal
inputs/sentimentdl_use_twitter/Example10.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ @PDubyaD right!!! LOL we'll get there!! I have high expectations, Warren Buffet style....
2
+ @PDubyaD right!!! LOL we'll get there!! I have high expectations, Warren Buffet style.
inputs/sentimentdl_use_twitter/Example2.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ holy crap. I take a nap for 4 hours and Pitchfork blows up my twitter dashboard. I wish I was at Coa...
2
+ holy crap. I take a nap for 4 hours and Pitchfork blows up my twitter dashboard. I wish I was at Coachella.
inputs/sentimentdl_use_twitter/Example3.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ @Susy412 he is working today ive tried that still not working..... hmmmm!! im rubbish with computer...
2
+ @Susy412 he is working today ive tried that still not working..... hmmmm!! im rubbish with computers haha!
inputs/sentimentdl_use_twitter/Example4.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Brand New Canon EOS 50D 15MP DSLR Camera Canon 17-85mm IS Lens ...: Web Technology Thread, Brand New...
2
+ Brand New Canon EOS 50D 15MP DSLR Camera Canon 17-85mm IS Lens ...: Web Technology Thread, Brand New Canon EOS 5.. http://u.mavrev.com/5a3t
inputs/sentimentdl_use_twitter/Example5.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Watching a programme about the life of Hitler, its only enhancing my geekiness of history....
2
+ Watching a programme about the life of Hitler, its only enhancing my geekiness of history.
inputs/sentimentdl_use_twitter/Example6.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ GM says expects announcment on sale of Hummer soon - Reuters: WDSUGM says expects announcment on sal...
2
+ GM says expects announcment on sale of Hummer soon - Reuters: WDSUGM says expects announcment on sale of Hummer .. http://bit.ly/4E1Fv
inputs/sentimentdl_use_twitter/Example7.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ @accannis @edog1203 Great Stanford course. Thanks for making it available to the public! Really help...
2
+ @accannis @edog1203 Great Stanford course. Thanks for making it available to the public! Really helpful and informative for starting off!
inputs/sentimentdl_use_twitter/Example8.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ @the_real_usher LeBron is cool. I like his personality...he has good character....
2
+ @the_real_usher LeBron is cool. I like his personality...he has good character.
inputs/sentimentdl_use_twitter/Example9.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ @sketchbug Lebron is a hometown hero to me, lol I love the Lakers but let's go Cavs, lol...
2
+ @sketchbug Lebron is a hometown hero to me, lol I love the Lakers but let's go Cavs, lol
pages/Workflow & Model Overview.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ # Custom CSS for better styling
4
+ st.markdown("""
5
+ <style>
6
+ .main-title {
7
+ font-size: 36px;
8
+ color: #4A90E2;
9
+ font-weight: bold;
10
+ text-align: center;
11
+ }
12
+ .sub-title {
13
+ font-size: 24px;
14
+ color: #4A90E2;
15
+ margin-top: 20px;
16
+ }
17
+ .section {
18
+ background-color: #f9f9f9;
19
+ padding: 15px;
20
+ border-radius: 10px;
21
+ margin-top: 20px;
22
+ }
23
+ .section h2 {
24
+ font-size: 22px;
25
+ color: #4A90E2;
26
+ }
27
+ .section p, .section ul {
28
+ color: #666666;
29
+ }
30
+ .link {
31
+ color: #4A90E2;
32
+ text-decoration: none;
33
+ }
34
+ </style>
35
+ """, unsafe_allow_html=True)
36
+
37
+ # Introduction
38
+ st.markdown('<div class="main-title">Detecting Toxic Comments with Spark NLP</div>', unsafe_allow_html=True)
39
+
40
+ st.markdown("""
41
+ <div class="section">
42
+ <p>Welcome to the Spark NLP Toxic Comment Detection Demo App! Discussing things you care about can be difficult. The threat of abuse and harassment online means that many people stop expressing themselves and give up on seeking different opinions. Platforms struggle to effectively facilitate conversations, leading many communities to limit or completely shut down user comments.</p>
43
+ <p>This app demonstrates how to use Spark NLP's MulticlassifierDL to automatically detect toxic comments, including categories like identity hate, insult, obscene, severe toxic, and threat.</p>
44
+ </div>
45
+ """, unsafe_allow_html=True)
46
+
47
+ # st.image('images/Toxic-Comments.jpg', caption="Different types of toxic comments detected using Spark NLP", use_column_width='auto')
48
+
49
+ # About Toxic Comment Classification
50
+ st.markdown('<div class="sub-title">About Toxic Comment Classification</div>', unsafe_allow_html=True)
51
+ st.markdown("""
52
+ <div class="section">
53
+ <p>The Conversation AI team, a research initiative founded by Jigsaw and Google (both part of Alphabet), is working on tools to help improve online conversations. One area of focus is the study of negative online behaviors, like toxic comments (comments that are rude, disrespectful, or likely to make someone leave a discussion).</p>
54
+ <p>This app utilizes the Spark NLP MulticlassifierDL model to detect various types of toxicity in comments. This model is capable of identifying and categorizing toxic comments into different classes such as toxic, severe toxic, identity hate, insult, obscene, and threat.</p>
55
+ </div>
56
+ """, unsafe_allow_html=True)
57
+
58
+ # Using MulticlassifierDL in Spark NLP
59
+ st.markdown('<div class="sub-title">Using MulticlassifierDL in Spark NLP</div>', unsafe_allow_html=True)
60
+ st.markdown("""
61
+ <div class="section">
62
+ <p>The MulticlassifierDL annotator in Spark NLP uses deep learning to classify text into multiple categories. This approach allows for a more nuanced understanding of the toxicity in comments, providing better tools for moderating online discussions.</p>
63
+ <p>Spark NLP also offers other annotators and models for different NLP tasks. If you are interested in exploring more, please check the <a class="link" href="https://nlp.johnsnowlabs.com/docs/en/annotators#multiclassifierdl" target="_blank" rel="noopener">MulticlassifierDL</a> documentation.</p>
64
+ </div>
65
+ """, unsafe_allow_html=True)
66
+
67
+ st.markdown('<h2 class="sub-title">Example Usage in Python</h2>', unsafe_allow_html=True)
68
+ st.markdown('<p>Here’s how you can implement toxic comment classification using the MulticlassifierDL annotator in Spark NLP:</p>', unsafe_allow_html=True)
69
+
70
+ # Setup Instructions
71
+ st.markdown('<div class="sub-title">Setup</div>', unsafe_allow_html=True)
72
+ st.markdown('<p>To install Spark NLP in Python, use your favorite package manager (conda, pip, etc.). For example:</p>', unsafe_allow_html=True)
73
+ st.code("""
74
+ pip install spark-nlp
75
+ pip install pyspark
76
+ """, language="bash")
77
+
78
+ st.markdown("<p>Then, import Spark NLP and start a Spark session:</p>", unsafe_allow_html=True)
79
+ st.code("""
80
+ import sparknlp
81
+
82
+ # Start Spark Session
83
+ spark = sparknlp.start()
84
+ """, language='python')
85
+
86
+ # Toxic Comment Classification Example
87
+ st.markdown('<div class="sub-title">Example Usage: Toxic Comment Classification with MulticlassifierDL</div>', unsafe_allow_html=True)
88
+ st.code('''
89
+ from sparknlp.base import DocumentAssembler
90
+ from sparknlp.annotator import UniversalSentenceEncoder, MultiClassifierDLModel
91
+ from pyspark.ml import Pipeline
92
+
93
+ # Step 1: Transforms raw texts to document annotation
94
+ document = DocumentAssembler() \\
95
+ .setInputCol("text") \\
96
+ .setOutputCol("document")
97
+
98
+ # Step 2: Use Universal Sentence Encoder for embeddings
99
+ use = UniversalSentenceEncoder.pretrained() \\
100
+ .setInputCols(["document"]) \\
101
+ .setOutputCol("use_embeddings")
102
+
103
+ # Step 3: Multiclass classification model
104
+ docClassifier = MultiClassifierDLModel.pretrained("multiclassifierdl_use_toxic") \\
105
+ .setInputCols(["use_embeddings"]) \\
106
+ .setOutputCol("category") \\
107
+ .setThreshold(0.5)
108
+
109
+ # Define the pipeline
110
+ pipeline = Pipeline(
111
+ stages = [
112
+ document,
113
+ use,
114
+ docClassifier
115
+ ]
116
+ )
117
+
118
+ # Create a Spark Data Frame with example sentences
119
+ data = spark.createDataFrame(
120
+ [
121
+ ["She should stop sticking her tongue out before someone rubs their ass on it. Filthy bitch!!!"]
122
+ ]
123
+ ).toDF("text") # use the column name `text` defined in the pipeline as input
124
+
125
+ # Fit-transform to get predictions
126
+ result = pipeline.fit(data).transform(data).select("text", "category.result").show(truncate=50)
127
+ ''', language='python')
128
+
129
+ st.text("""
130
+ +--------------------------------------------------+------------------------+
131
+ | text| result|
132
+ +--------------------------------------------------+------------------------+
133
+ |She should stop sticking her tongue out before ...|[toxic, insult, obscene]|
134
+ +--------------------------------------------------+------------------------+
135
+ """)
136
+
137
+ st.markdown("""
138
+ <p>The code snippet demonstrates how to set up a pipeline in Spark NLP to classify toxic comments using the MulticlassifierDL annotator. The resulting DataFrame contains the predictions for each comment.</p>
139
+ """, unsafe_allow_html=True)
140
+
141
+ # One-liner Alternative
142
+ st.markdown('<div class="sub-title">One-liner Alternative</div>', unsafe_allow_html=True)
143
+ st.markdown("""
144
+ <div class="section">
145
+ <p>In October 2022, John Snow Labs released the open-source <code>johnsnowlabs</code> library that contains all the company products, open-source and licensed, under one common library. This simplified the workflow, especially for users working with more than one of the libraries (e.g., Spark NLP + Healthcare NLP). This new library is a wrapper on all of John Snow Lab’s libraries and can be installed with pip:</p>
146
+ <p><code>pip install johnsnowlabs</code></p>
147
+ </div>
148
+ """, unsafe_allow_html=True)
149
+
150
+ st.markdown('<p>To run toxic comment classification with one line of code, we can simply:</p>', unsafe_allow_html=True)
151
+ st.code("""
152
+ # Import the NLP module which contains Spark NLP and NLU libraries
153
+ from johnsnowlabs import nlp
154
+
155
+ sample_text = ["You are a horrible person!", "I love your new profile picture!", "Go away, no one likes you."]
156
+
157
+ # Returns a pandas DataFrame, we select the desired columns
158
+ nlp.load('en.classify.toxic').predict(sample_text, output_level='sentence')
159
+ """, language='python')
160
+
161
+ st.image('images/johnsnowlabs-toxic-output.png', use_column_width='auto')
162
+
163
+ st.markdown("""
164
+ <p>This approach demonstrates how to use the <code>johnsnowlabs</code> library to perform toxic comment classification with a single line of code. The resulting DataFrame contains the predictions for each comment.</p>
165
+ """, unsafe_allow_html=True)
166
+
167
+ # Benchmarking
168
+ st.markdown('<div class="sub-title">Benchmarking</div>', unsafe_allow_html=True)
169
+ st.markdown("""
170
+ <div class="section">
171
+ <p>Here are the benchmarking results for the MulticlassifierDL model on the toxic comment classification task:</p>
172
+ <pre>
173
+ precision recall f1-score support
174
+
175
+ 0 0.56 0.30 0.39 127
176
+ 1 0.71 0.70 0.70 761
177
+ 2 0.76 0.72 0.74 824
178
+ 3 0.55 0.21 0.31 147
179
+ 4 0.79 0.38 0.51 50
180
+ 5 0.94 1.00 0.97 1504
181
+
182
+ micro avg 0.83 0.80 0.81 3413
183
+ macro avg 0.72 0.55 0.60 3413
184
+ weighted avg 0.81 0.80 0.80 3413
185
+ samples avg 0.84 0.83 0.80 3413
186
+
187
+ F1 micro averaging: 0.8113432835820896
188
+ </pre></div>
189
+ """, unsafe_allow_html=True)
190
+
191
+ # Additional Resources
192
+ st.markdown('<div class="sub-title">Additional Resources</div>', unsafe_allow_html=True)
193
+ st.markdown("""
194
+ <div class="section">
195
+ <ul>
196
+ <li>Python Docs : <a class="link" href="https://nlp.johnsnowlabs.com/docs/en/annotators#multiclassifierdl" target="_blank" rel="noopener">MultiClassifierDLModel</a></li>
197
+ <li>Model used : <a class="link" href="https://sparknlp.org/2021/01/21/multiclassifierdl_use_toxic_en.html" target="_blank" rel="noopener">multiclassifierdl_use_toxic</a></li>
198
+ <li><a class="link" href="https://sparknlp.org/" target="_blank">Official Website</a>: Documentation and examples</li>
199
+ </ul>
200
+ </div>
201
+ """, unsafe_allow_html=True)
202
+
203
+ st.markdown('<div class="sub-title">Community & Support</div>', unsafe_allow_html=True)
204
+ st.markdown("""
205
+ <div class="section">
206
+ <ul>
207
+ <li><a class="link" href="https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q" target="_blank">Slack</a>: Live discussion with the community and team</li>
208
+ <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank">GitHub</a>: Bug reports, feature requests, and contributions</li>
209
+ <li><a class="link" href="https://medium.com/spark-nlp" target="_blank">Medium</a>: Spark NLP articles</li>
210
+ <li><a class="link" href="https://www.youtube.com/channel/UCmFOjlpYEhxf_wJUDuz6xxQ/videos" target="_blank">YouTube</a>: Video tutorials</li>
211
+ </ul>
212
+ </div>
213
+ """, unsafe_allow_html=True)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ numpy
4
+ spark-nlp
5
+ pyspark