import streamlit as st
import sparknlp
import os
import pandas as pd
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline

# Page configuration
st.set_page_config(
    layout="wide",
    page_title="Spark NLP Demos App",
    initial_sidebar_state="auto"
)

# CSS for styling
st.markdown(""" """, unsafe_allow_html=True)

@st.cache_resource
def init_spark():
    # Start (or reuse) a local Spark session with the Spark NLP jars
    return sparknlp.start()

@st.cache_resource
def create_pipeline(model):
    documentAssembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    tokenizer = Tokenizer() \
        .setInputCols("document") \
        .setOutputCol("token")

    sequenceClassifier_loaded = BertForSequenceClassification.pretrained("bert_classifier_toxic", "en") \
        .setInputCols(["document", "token"]) \
        .setOutputCol("class")

    pipeline = Pipeline(stages=[documentAssembler, tokenizer, sequenceClassifier_loaded])
    return pipeline

def fit_data(pipeline, data):
    # Fit on an empty DataFrame, then wrap the model in a LightPipeline
    # for fast, single-text annotation without a Spark job per request
    empty_df = spark.createDataFrame([['']]).toDF('text')
    pipeline_model = pipeline.fit(empty_df)
    model = LightPipeline(pipeline_model)
    results = model.fullAnnotate(data)[0]
    return results['class'][0].result

# Set up the page layout
st.markdown('State-of-the-Art Detecting Toxic Comments with Spark NLP', unsafe_allow_html=True)

# Sidebar content
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    ["bert_classifier_toxic"],
    help="For more info about the models visit: https://sparknlp.org/models"
)

# Reference notebook link in sidebar
link = """Open In Colab"""
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)

# Load examples
examples = [
    "Streatham again Thank you for the copyedits I have been considering withdrawing the nomination owing to the nonconstructive arguing with a certain reviewer but your edits make me think I cant let everyone down Thank you",
    "Do you know how pathetic you look with your silly little crusade of removing England or English with articles? Grow up and find something better to do with your life.",
    "it is obvius to, me, a black man, that orange is for da black Gs, blue is for mexicans and yellow is 4 whites. coz whites look like woman. and mexicans try to compete with da nigger Gs but we r too strong. but in realty the whites also lose to da black man.",
    "She should stop sticking her tongue out before someone rubs their ass on it. Filthy bitch!!!",
    "personal attacks dont tell me what i can and can not do i'll come over there right now and throw a brick through ur window",
    "First i vandalized his page, then i vandalized again, and then his IP adress was blocked hahahahahahahahahahahahahahahahahahaha. This plan is so evil and yet so much fun.",
    "Silly guy. You should use email like the wikijews",
    """Quick question about the section re-ordering... Why was Antagonists switched with Supporting, and why were all ""bad"" characters removed from the Supporting section? (converse) " """,
    "I will respond until Armenian scum will stop",
    "Isn't tart a kind of whore or concubine or courtesane Eg she's a politician's tart",
    "....thats why I kill you and your father and your mother and you brother",
    "You're ridiculous It's obvious that you have a problem with people with money and education. Leave the Horace Mann page alone. You know nothing of the school whatsoever. HM doesn't admit dirt."
]

st.subheader("Classify comments and tweets into Toxic, Insults, Hate, Obscene, Threat.")

selected_text = st.selectbox("Select a sample", examples)
custom_input = st.text_input("Try it for yourself!")

# A custom input, if provided, takes precedence over the selected sample
if custom_input:
    selected_text = custom_input

st.subheader('Selected Text')
st.write(selected_text)

# Initialize Spark and create pipeline
spark = init_spark()
pipeline = create_pipeline(model)
output = fit_data(pipeline, selected_text)
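# Note on the value of `output`: LightPipeline.fullAnnotate returns one dict per
# input text, keyed by output column (a sketch of the structure, not actual output):
#   [{'document': [...], 'token': [...], 'class': [Annotation('category', 0, 53, 'toxic', ...)]}]
# so fit_data() hands back just the predicted label string, which the branches
# below map to a human-readable message.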

# Display output sentence
if output == 'severe_toxic':
    st.markdown("""This seems like a {} tweet. 🤬""".format('severely toxic'), unsafe_allow_html=True)
elif output == 'toxic':
    st.markdown("""This seems like a {} tweet. 🤬""".format(output), unsafe_allow_html=True)
elif output == 'insult':
    st.markdown("""This seems like an {} tweet. 😰""".format('insulting'), unsafe_allow_html=True)
elif output == 'identity_hate':
    st.markdown("""This seems like an {} tweet. 😰""".format(output), unsafe_allow_html=True)
elif output == 'obscene':
    st.markdown("""This seems like an {} tweet. 🤬""".format(output), unsafe_allow_html=True)
elif output == 'threat':
    st.markdown("""This seems like a {} tweet. 🤬""".format('threatening'), unsafe_allow_html=True)