File size: 5,056 Bytes
0b76f34 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
import streamlit as st
import sparknlp
import os
import pandas as pd
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline
# Page configuration: wide layout, sidebar state left on "auto".
st.set_page_config(layout="wide", initial_sidebar_state="auto")

# Stylesheet for the page: defines the "main-title" heading class and the
# text colour used by paragraphs/lists inside ".section" containers.
_PAGE_CSS = """
<style>
.main-title {
font-size: 36px;
color: #4A90E2;
font-weight: bold;
text-align: center;
}
.section p, .section ul {
color: #666666;
}
</style>
"""
# Raw HTML is required here so the <style> element is emitted as markup.
st.markdown(_PAGE_CSS, unsafe_allow_html=True)
@st.cache_resource
def init_spark():
    """Start Spark NLP and return whatever ``sparknlp.start()`` yields.

    The function is decorated with ``st.cache_resource``; the returned
    object is used elsewhere in this script as the Spark session
    (``spark.createDataFrame``).
    """
    session = sparknlp.start()
    return session
@st.cache_resource
def create_pipeline(model):
    """Assemble the unfitted Swahili sentiment-classification pipeline.

    Args:
        model: Name of the pretrained ``ClassifierDLModel`` to load for the
            "sw" language (the value chosen in the sidebar).

    Returns:
        A ``pyspark.ml.Pipeline`` whose stages are, in order: document
        assembler, tokenizer, normalizer, stop-word cleaner, XLM-RoBERTa
        token embeddings, average-pooled sentence embeddings, and the
        sentiment classifier, which writes to the ``class_`` column.
    """
    document_assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )

    tokenizer = (
        Tokenizer()
        .setInputCols(["document"])
        .setOutputCol("token")
    )

    normalizer = (
        Normalizer()
        .setInputCols(["token"])
        .setOutputCol("normalized")
    )

    stopwords_cleaner = (
        StopWordsCleaner.pretrained("stopwords_sw", "sw")
        .setInputCols(["normalized"])
        .setOutputCol("cleanTokens")
        .setCaseSensitive(False)
    )

    embeddings = (
        XlmRoBertaEmbeddings.pretrained("xlm_roberta_base_finetuned_swahili", "sw")
        .setInputCols(["document", "cleanTokens"])
        .setOutputCol("embeddings")
    )

    sentence_embeddings = (
        SentenceEmbeddings()
        .setInputCols(["document", "embeddings"])
        .setOutputCol("sentence_embeddings")
        .setPoolingStrategy("AVERAGE")
    )

    # Load the classifier named by the caller instead of a hard-coded name,
    # so the ``model`` argument (and the sidebar selection behind it)
    # actually decides which classifier this pipeline uses.
    sentiment_classifier = (
        ClassifierDLModel.pretrained(model, "sw")
        .setInputCols(["sentence_embeddings"])
        .setOutputCol("class_")
    )

    return Pipeline(
        stages=[
            document_assembler,
            tokenizer,
            normalizer,
            stopwords_cleaner,
            embeddings,
            sentence_embeddings,
            sentiment_classifier,
        ]
    )
def fit_data(pipeline, data):
    """Fit ``pipeline`` and return the sentiment label predicted for ``data``.

    Args:
        pipeline: Unfitted pipeline exposing ``fit``; its output must
            contain a ``class_`` column.
        data: The text to classify.

    Returns:
        The ``result`` attribute of the first ``class_`` annotation, or an
        empty string when no ``class_`` annotation was produced.
    """
    # Fit on a one-row placeholder frame to obtain a fitted model.
    # NOTE(review): presumably no stage needs real training data - confirm.
    # Relies on the module-level ``spark`` session being defined before
    # this function is called.
    empty_df = spark.createDataFrame([[""]]).toDF("text")
    light_model = LightPipeline(pipeline.fit(empty_df))

    annotations = light_model.fullAnnotate(data)[0]
    labels = annotations["class_"]
    # Guard against an empty annotation list instead of raising IndexError.
    if not labels:
        return ""
    return labels[0].result
# Page heading, rendered as raw HTML so the "main-title" class applies.
title_html = '<div class="main-title">State-of-the-Art Swahili Sentiment Detection with Spark NLP</div>'
st.markdown(title_html, unsafe_allow_html=True)

# Sidebar: pretrained-model picker (a single option is offered).
available_models = ["classifierdl_xlm_roberta_sentiment"]
models_help = "For more info about the models visit: https://sparknlp.org/models"
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    available_models,
    help=models_help,
)

# Sidebar: "Open In Colab" badge linking to the reference notebook.
link = """
<a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/SENTIMENT_SW.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
</a>
"""
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)
# Sample Swahili sentences offered in the dropdown.
# NOTE(review): the characters at the end of the "Safaricom" sample look like
# mis-encoded emoji; they are left untouched because the intended characters
# cannot be recovered from this file.
examples = [
    "Tukio bora katika sinema ilikuwa wakati Gerardo anajaribu kupata wimbo ambao unaendelea kupitia kichwa chake.",
    "Ni dharau kwa akili ya mtu na upotezaji mkubwa wa pesa",
    "Kris Kristoffersen ni mzuri kwenye sinema hii na kweli hufanya tofauti.",
    "Hadithi yenyewe ni ya kutabirika tu na ya uvivu.",
    "Ninapendekeza hizi kwa kuwa zinaonekana nzuri sana, kifahari na nzuri",
    "Safaricom si muache kucheza na mkopo wa nambari yangu tafadhali. mnanifilisisha๐๐๐ฏ",
    "Bidhaa ilikuwa bora na inafanya kazi vizuri kuliko ya verizon na bei ilikuwa rahisi ",
    "Siwezi kuona jinsi sinema hii inavyoweza kuwa msukumo kwa mtu yeyote kushinda woga na kukataliwa.",
    "Sinema hii inasawazishwa vizuri na vichekesho na mchezo wa kuigiza na nilijifurahisha sana.",
]

st.subheader("This model identifies positive or negative sentiments in Swahili texts.")

selected_text = st.selectbox("Select a sample", examples)
custom_input = st.text_input("Try it for yourself!")

# Text typed by the user takes precedence over the dropdown sample; when the
# box is empty the dropdown value is kept as-is.
if custom_input:
    selected_text = custom_input

st.subheader('Selected Text')
st.write(selected_text)
# Initialize Spark, build the pipeline for the chosen model, and classify
# the selected text.
spark = init_spark()
pipeline = create_pipeline(model)
output = fit_data(pipeline, selected_text)

# Display the verdict. Both short and long label spellings are accepted.
label = output.lower()
if label in ('pos', 'positive'):
    st.markdown(
        '<h3>This seems like a <span style="color: green">positive</span> text. '
        '<span style="font-size:35px;">😃</span></h3>',
        unsafe_allow_html=True,
    )
elif label in ('neg', 'negative'):
    # The emoji <span> is closed properly here; the closing tag was
    # previously malformed, which produced invalid HTML.
    st.markdown(
        '<h3>This seems like a <span style="color: red">negative</span> text. '
        '<span style="font-size:35px;">😠</span></h3>',
        unsafe_allow_html=True,
    )
else:
    # An unrecognised label used to render nothing at all; report it instead.
    st.warning(f"Could not determine the sentiment (model output: {output!r}).")
|