abdullahmubeen10 committed (verified)
Commit f475ccd · 1 Parent(s): 9c65f75

Upload 9 files

.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
+ [theme]
+ base="light"
+ primaryColor="#29B4E8"
Demo.py ADDED
@@ -0,0 +1,120 @@
+ import streamlit as st
+ import sparknlp
+ import os
+ import pandas as pd
+
+ from sparknlp.base import *
+ from sparknlp.annotator import *
+ from pyspark.ml import Pipeline
+ from sparknlp.pretrained import PretrainedPipeline
+
+ # Page configuration
+ st.set_page_config(
+     layout="wide",
+     page_title="Spark NLP Demos App",
+     initial_sidebar_state="auto"
+ )
+
+ # CSS for styling
+ st.markdown("""
+     <style>
+         .main-title {
+             font-size: 36px;
+             color: #4A90E2;
+             font-weight: bold;
+             text-align: center;
+         }
+         .section p, .section ul {
+             color: #666666;
+         }
+     </style>
+ """, unsafe_allow_html=True)
+
+ @st.cache_resource
+ def init_spark():
+     return sparknlp.start()
+
+ @st.cache_resource
+ def create_pipeline(model):
+     documentAssembler = DocumentAssembler()\
+         .setInputCol("text")\
+         .setOutputCol("document")
+
+     use = UniversalSentenceEncoder.pretrained()\
+         .setInputCols(["document"])\
+         .setOutputCol("sentence_embeddings")
+
+
+     sentimentdl = ClassifierDLModel.pretrained(model)\
+         .setInputCols(["sentence_embeddings"])\
+         .setOutputCol("sentiment")
+
+     nlpPipeline = Pipeline(stages=[documentAssembler, use, sentimentdl])
+
+     return nlpPipeline
+
+ def fit_data(pipeline, data):
+     empty_df = spark.createDataFrame([['']]).toDF('text')
+     pipeline_model = pipeline.fit(empty_df)
+     model = LightPipeline(pipeline_model)
+     results = model.fullAnnotate(data)[0]
+
+     return results['sentiment'][0].result
+
+ # Set up the page layout
+ st.markdown('<div class="main-title">Detect Cyberbullying in Tweets with Spark NLP</div>', unsafe_allow_html=True)
+
+ # Sidebar content
+ model = st.sidebar.selectbox(
+     "Choose the pretrained model",
+     ["classifierdl_use_cyberbullying"],
+     help="For more info about the models visit: https://sparknlp.org/models"
+ )
+
+ # Reference notebook link in sidebar
+ link = """
+ <a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/SENTIMENT_EN_CYBERBULLYING.ipynb">
+     <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
+ </a>
+ """
+ st.sidebar.markdown('Reference notebook:')
+ st.sidebar.markdown(link, unsafe_allow_html=True)
+
+ # Load examples
+ examples = [
+     "@CALMicC he kept me informed on stuff id missed and seemed ok. I liked him.",
+     "@AMohedin Okay, we have women being physically inferior and the either emotionally or mentally inferior in some way.",
+     "@LynnMagic people think that implying association via follow is a bad thing. but it's shockingly accurate.",
+     "@Rayandawlah_ @_Jihad10 These days might and honor come from science, technology, humanitarianism. Which is why Muslims won't get any.",
+     "Stay outve Congress and we have a deal. @jacobkramer17 Call me sexist bt the super bowl should b guys only no women are allowed n th stadium",
+     "I'm looking for a few people to help with @ggautoblocker's twitter. Log &amp; categorize mentions as support requests/abusive/positive tweets.",
+     "@geeky_zekey Thanks for showing again that blacks are the biggest racists. Blocked",
+     """@ListenToRaisin No question. Feminists have the media. Did you see any mention of Clem Fords OPEN bigotry, etc? Nope. "Narrative" is all.""",
+     "RT @EBeisner @ahall012 I agree with you!! I would rather brush my teeth with sandpaper then watch football with a girl!!",
+     "@hibach8 But it is a lie. The religion is a disgusting, terrorist, hate mongering piece of filth. That has nothing to do with individuals."
+ ]
+
+ st.subheader("Identify racist, sexist, or neutral tweets using our pretrained cyberbullying classifier.")
+
+ selected_text = st.selectbox("Select a sample", examples)
+ custom_input = st.text_input("Try it for yourself!")
+
+ if custom_input:
+     selected_text = custom_input
+ elif selected_text:
+     selected_text = selected_text
+
+ st.subheader('Selected Text')
+ st.write(selected_text)
+
+ # Initialize Spark and create pipeline
+ spark = init_spark()
+ pipeline = create_pipeline(model)
+ output = fit_data(pipeline, selected_text)
+
+ # Display output sentence
+ if output.lower() in ['neutral', 'normal']:
+     st.markdown("""<h3>This seems like a <span style="color: green">{}</span> tweet. <span style="font-size:35px;">&#128515;</span></h3>""".format(output), unsafe_allow_html=True)
+ elif output.lower() in ['racism', 'sexism']:
+     st.markdown("""<h3>This seems like a <span style="color: #B64434">{}</span> tweet. <span style="font-size:35px;">&#129324;</span></h3>""".format(output), unsafe_allow_html=True)
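Note: to try the app outside the container, a minimal local run could look like the commands below (an assumption for illustration: a working Python environment plus Java 8, which Spark NLP needs and which the Dockerfile installs via openjdk-8-jdk):

    # install the listed dependencies, then launch the Streamlit app
    pip install -r requirements.txt
    streamlit run Demo.py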
Dockerfile ADDED
@@ -0,0 +1,70 @@
+ # Download base image ubuntu 18.04
+ FROM ubuntu:18.04
+
+ # Set environment variables
+ ENV NB_USER jovyan
+ ENV NB_UID 1000
+ ENV HOME /home/${NB_USER}
+
+ # Install required packages
+ RUN apt-get update && apt-get install -y \
+     tar \
+     wget \
+     bash \
+     rsync \
+     gcc \
+     libfreetype6-dev \
+     libhdf5-serial-dev \
+     libpng-dev \
+     libzmq3-dev \
+     python3 \
+     python3-dev \
+     python3-pip \
+     unzip \
+     pkg-config \
+     software-properties-common \
+     graphviz \
+     openjdk-8-jdk \
+     ant \
+     ca-certificates-java \
+     && apt-get clean \
+     && update-ca-certificates -f;
+
+ # Install Python 3.8 and pip
+ RUN add-apt-repository ppa:deadsnakes/ppa \
+     && apt-get update \
+     && apt-get install -y python3.8 python3-pip \
+     && apt-get clean;
+
+ # Set up JAVA_HOME
+ ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
+ RUN mkdir -p ${HOME} \
+     && echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> ${HOME}/.bashrc \
+     && chown -R ${NB_UID}:${NB_UID} ${HOME}
+
+ # Create a new user named "jovyan" with user ID 1000
+ RUN useradd -m -u ${NB_UID} ${NB_USER}
+
+ # Switch to the "jovyan" user
+ USER ${NB_USER}
+
+ # Set home and path variables for the user
+ ENV HOME=/home/${NB_USER} \
+     PATH=/home/${NB_USER}/.local/bin:$PATH
+
+ # Set the working directory to the user's home directory
+ WORKDIR ${HOME}
+
+ # Upgrade pip and install Python dependencies
+ RUN python3.8 -m pip install --upgrade pip
+ COPY requirements.txt /tmp/requirements.txt
+ RUN python3.8 -m pip install -r /tmp/requirements.txt
+
+ # Copy the application code into the container at /home/jovyan
+ COPY --chown=${NB_USER}:${NB_USER} . ${HOME}
+
+ # Expose port for Streamlit
+ EXPOSE 7860
+
+ # Define the entry point for the container
+ ENTRYPOINT ["streamlit", "run", "Demo.py", "--server.port=7860", "--server.address=0.0.0.0"]
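With this Dockerfile, building and starting the container could look like the commands below (the image tag spark-nlp-cyberbullying-demo is an arbitrary example; port 7860 matches the EXPOSE and ENTRYPOINT settings above):

    # build the image from the repository root, then run it and publish the Streamlit port
    docker build -t spark-nlp-cyberbullying-demo .
    docker run -p 7860:7860 spark-nlp-cyberbullying-demo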
images/Cyberbullying.jpeg ADDED
images/Sentiment-Analysis.jpg ADDED
images/dataset.png ADDED
images/johnsnowlabs-sentiment-output.png ADDED
pages/Workflow & Model Overview.py ADDED
@@ -0,0 +1,176 @@
+ import streamlit as st
+
+ # Custom CSS for better styling
+ st.markdown("""
+     <style>
+         .main-title {
+             font-size: 36px;
+             color: #4A90E2;
+             font-weight: bold;
+             text-align: center;
+         }
+         .sub-title {
+             font-size: 24px;
+             color: #4A90E2;
+             margin-top: 20px;
+         }
+         .section {
+             background-color: #f9f9f9;
+             padding: 15px;
+             border-radius: 10px;
+             margin-top: 20px;
+         }
+         .section h2 {
+             font-size: 22px;
+             color: #4A90E2;
+         }
+         .section p, .section ul {
+             color: #666666;
+         }
+         .link {
+             color: #4A90E2;
+             text-decoration: none;
+         }
+     </style>
+ """, unsafe_allow_html=True)
+
+ # Introduction
+ st.markdown('<div class="main-title">Cyberbullying Detection in Tweets with Spark NLP</div>', unsafe_allow_html=True)
+
+ st.markdown("""
+ <div class="section">
+     <p>Welcome to the Spark NLP Cyberbullying Detection Demo App! Detecting cyberbullying in social media posts is crucial to creating a safer online environment. This app demonstrates how to use Spark NLP's powerful tools to identify and classify cyberbullying in tweets.</p>
+ </div>
+ """, unsafe_allow_html=True)
+
+ st.write("")
+ st.image('images/Cyberbullying.jpeg', use_column_width='auto')
+
+ # About Cyberbullying Detection
+ st.markdown('<div class="sub-title">About Cyberbullying Detection</div>', unsafe_allow_html=True)
+ st.markdown("""
+ <div class="section">
+     <p>Cyberbullying detection involves analyzing text to identify instances of harmful, threatening, or abusive language. Cyberbullying can have severe psychological effects on victims, making it essential to identify and address it promptly. Using Spark NLP, we can build a model to detect and classify cyberbullying in social media posts, helping to mitigate the negative impacts of online harassment.</p>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # Using the Cyberbullying Detection Model in Spark NLP
+ st.markdown('<div class="sub-title">Using the Cyberbullying Detection Model in Spark NLP</div>', unsafe_allow_html=True)
+ st.markdown("""
+ <div class="section">
+     <p>The following pipeline uses the Universal Sentence Encoder and a pre-trained ClassifierDL model to detect cyberbullying in tweets. This model can identify various forms of cyberbullying, such as racism and sexism.</p>
+ </div>
+ """, unsafe_allow_html=True)
+
+ st.markdown('<h2 class="sub-title">Example Usage in Python</h2>', unsafe_allow_html=True)
+
+ # Setup Instructions
+ st.markdown('<div class="sub-title">Setup</div>', unsafe_allow_html=True)
+ st.markdown('<p>To install Spark NLP in Python, use your favorite package manager (conda, pip, etc.). For example:</p>', unsafe_allow_html=True)
+ st.code("""
+ pip install spark-nlp
+ pip install pyspark
+ """, language="bash")
+
+ st.markdown("<p>Then, import Spark NLP and start a Spark session:</p>", unsafe_allow_html=True)
+ st.code("""
+ import sparknlp
+
+ # Start Spark Session
+ spark = sparknlp.start()
+ """, language='python')
+
+ # Cyberbullying Detection Example
+ st.markdown('<div class="sub-title">Example Usage: Cyberbullying Detection with Spark NLP</div>', unsafe_allow_html=True)
+ st.code('''
+ from sparknlp.base import DocumentAssembler, LightPipeline
+ from sparknlp.annotator import UniversalSentenceEncoder, ClassifierDLModel
+ from pyspark.ml import Pipeline
+
+ # Step 1: Transform raw text into document annotations
+ document_assembler = DocumentAssembler()\\
+     .setInputCol("text")\\
+     .setOutputCol("document")
+
+ # Step 2: Universal Sentence Encoder
+ use = UniversalSentenceEncoder.pretrained('tfhub_use', lang="en") \\
+     .setInputCols(["document"])\\
+     .setOutputCol("sentence_embeddings")
+
+ # Step 3: ClassifierDLModel for Cyberbullying Detection
+ document_classifier = ClassifierDLModel.pretrained('classifierdl_use_cyberbullying', 'en') \\
+     .setInputCols(["sentence_embeddings"]) \\
+     .setOutputCol("class")
+
+ # Define the pipeline
+ nlp_pipeline = Pipeline(stages=[document_assembler, use, document_classifier])
+
+ # Create a light pipeline for prediction
+ light_pipeline = LightPipeline(nlp_pipeline.fit(spark.createDataFrame([['']]).toDF("text")))
+
+ # Predict cyberbullying in a tweet
+ annotations = light_pipeline.fullAnnotate('@geeky_zekey Thanks for showing again that blacks are the biggest racists. Blocked')
+ print(annotations[0]['class'][0])
+ ''', language='python')
+
+ st.text("""
+ Output:
+ Annotation(category, 0, 81, racism, {'sentence': '0', 'sexism': '2.4904006E-7', 'neutral': '9.4820876E-5', 'racism': '0.9999049'}, [])
+ """)
+
+ st.markdown("""
+ <p>The annotation classifies the text as "racism" with a probability score of 0.9999049, indicating very high confidence, while also providing low probability scores for "sexism" and "neutral."</p>
+ """, unsafe_allow_html=True)
+
+ # Benchmarking Section
+ st.markdown('<div class="sub-title">Benchmarking</div>', unsafe_allow_html=True)
+ st.markdown("""
+ <div class="section">
+     <p>The following table summarizes the performance of the Cyberbullying Detection model in terms of precision, recall, and f1-score:</p>
+     <pre>
+               precision    recall  f1-score   support
+
+      neutral       0.72      0.76      0.74       700
+       racism       0.89      0.94      0.92       773
+       sexism       0.82      0.71      0.76       622
+
+     accuracy                           0.81      2095
+    macro avg       0.81      0.80      0.80      2095
+ weighted avg       0.81      0.81      0.81      2095
+     </pre>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # Conclusion
+ st.markdown("""
+ <div class="section">
+     <h2>Conclusion</h2>
+     <p>In this app, we demonstrated how to use Spark NLP's ClassifierDL model to perform cyberbullying detection on tweet data. These powerful tools enable users to efficiently process large datasets and identify harmful content, providing deeper insights for various applications. By integrating these annotators into your NLP pipelines, you can enhance text understanding, information extraction, and online safety measures.</p>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # References and Additional Information
+ st.markdown('<div class="sub-title">For additional information, please check the following references.</div>', unsafe_allow_html=True)
+
+ st.markdown("""
+ <div class="section">
+     <ul>
+         <li>Documentation: <a href="https://nlp.johnsnowlabs.com/docs/en/transformers#classifierdl" target="_blank" rel="noopener">ClassifierDLModel</a></li>
+         <li>Python Docs: <a href="https://nlp.johnsnowlabs.com/api/python/reference/autosummary/sparknlp/annotator/classifierdl/index.html#sparknlp.annotator.classifierdl.ClassifierDLModel" target="_blank" rel="noopener">ClassifierDLModel</a></li>
+         <li>Model Used: <a href="https://sparknlp.org/2021/01/09/classifierdl_use_cyberbullying_en.html" target="_blank" rel="noopener">classifierdl_use_cyberbullying</a></li>
+     </ul>
+ </div>
+ """, unsafe_allow_html=True)
+
+ st.markdown('<div class="sub-title">Community & Support</div>', unsafe_allow_html=True)
+ st.markdown("""
+ <div class="section">
+     <ul>
+         <li><a class="link" href="https://sparknlp.org/" target="_blank">Official Website</a>: Documentation and examples</li>
+         <li><a class="link" href="https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q" target="_blank">Slack</a>: Live discussion with the community and team</li>
+         <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank">GitHub</a>: Bug reports, feature requests, and contributions</li>
+         <li><a class="link" href="https://medium.com/spark-nlp" target="_blank">Medium</a>: Spark NLP articles</li>
+         <li><a class="link" href="https://www.youtube.com/channel/UCmFOjlpYEhxf_wJUDuz6xxQ/videos" target="_blank">YouTube</a>: Video tutorials</li>
+     </ul>
+ </div>
+ """, unsafe_allow_html=True)
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ streamlit
+ pandas
+ numpy
+ spark-nlp
+ pyspark