import streamlit as st
import sparknlp
import pandas as pd
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline
from annotated_text import annotated_text
# Page configuration
st.set_page_config(
    layout="wide",
    initial_sidebar_state="auto"
)
# CSS for styling
st.markdown("""
    <style>
        .main-title {
            font-size: 36px;
            color: #4A90E2;
            font-weight: bold;
            text-align: center;
        }
        .section {
            background-color: #f9f9f9;
            padding: 10px;
            border-radius: 10px;
            margin-top: 10px;
        }
        .section p, .section ul {
            color: #666666;
        }
    </style>
""", unsafe_allow_html=True)
@st.cache_resource
def init_spark():
    """Start a Spark NLP session once; st.cache_resource reuses it across reruns."""
    return sparknlp.start()
@st.cache_resource
def create_pipeline(model):
    # Wrap raw text in Spark NLP's document annotation
    documentAssembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    # Split the document into sentences
    sentence_detector = SentenceDetector() \
        .setInputCols(["document"]) \
        .setOutputCol("sentence")

    # Split each sentence into tokens
    tokenizer = Tokenizer() \
        .setInputCols(["sentence"]) \
        .setOutputCol("token")

    # Pretrained 300-dimensional Hebrew word embeddings
    word_embeddings = WordEmbeddingsModel.pretrained("hebrew_cc_300d", "he") \
        .setInputCols(["sentence", "token"]) \
        .setOutputCol("embeddings")

    # Pretrained Hebrew NER model (use the model chosen in the sidebar)
    ner = NerDLModel.pretrained(model, "he") \
        .setInputCols(["sentence", "token", "embeddings"]) \
        .setOutputCol("ner")

    # Merge consecutive IOB tags into entity chunks
    ner_converter = NerConverter() \
        .setInputCols(["sentence", "token", "ner"]) \
        .setOutputCol("ner_chunk")

    pipeline = Pipeline(stages=[documentAssembler, sentence_detector, tokenizer, word_embeddings, ner, ner_converter])
    return pipeline
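
# The stages above form a linear annotation chain:
#   text -> document -> sentence -> token -> (token + embeddings) -> ner -> ner_chunk
# A hedged sketch of what the fitted pipeline yields (field values illustrative):
#   light.fullAnnotate("...")[0]["ner_chunk"]
#   -> [Annotation(chunk, begin, end, result="ื“ืŸ ื‘ื•ืจืกื˜ื™ืŸ", metadata={"entity": "PERS"})]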
def fit_data(pipeline, data):
    """Fit the pipeline on an empty DataFrame (global Spark session), then annotate `data` in memory."""
    empty_df = spark.createDataFrame([['']]).toDF('text')
    pipeline_model = pipeline.fit(empty_df)
    model = LightPipeline(pipeline_model)
    result = model.fullAnnotate(data)
    return result
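
# LightPipeline annotates plain Python strings in memory, avoiding a Spark
# DataFrame round trip, which is much faster for single interactive inputs.
# A minimal usage sketch (input text is illustrative):
#   output = fit_data(create_pipeline("hebrewner_cc_300d"), "ื“ืŸ ื‘ื•ืจืกื˜ื™ืŸ ื›ืชื‘ ืกืคืจ")
#   output[0]["ner_chunk"]  # one annotation per detected entity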
def annotate(data):
    """Render the document with its entity chunks highlighted via annotated_text."""
    document, chunks, labels = data["Document"], data["NER Chunk"], data["NER Label"]
    annotated_words = []
    for chunk, label in zip(chunks, labels):
        # Split off the plain text that precedes the next chunk occurrence
        parts = document.split(chunk, 1)
        if parts[0]:
            annotated_words.append(parts[0])
        annotated_words.append((chunk, label))
        document = parts[1]
    if document:
        annotated_words.append(document)
    annotated_text(*annotated_words)
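
# annotated_text accepts a mix of plain strings and (text, label) tuples, so
# the list built above alternates untagged spans with tagged entity chunks,
# e.g. (label shown is illustrative):
#   annotated_text("ื”ืกืคืจ ืฉืœ ", ("ื“ืŸ ื‘ื•ืจืกื˜ื™ืŸ", "PERS"), " ื™ืฆื ืœืื•ืจ")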
# Set up the page layout
st.markdown('<div class="main-title">Recognize entities in Hebrew text</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
<p>Named Entity Recognition (NER) models identify and categorize important entities in a text. This page details a word embeddings-based NER model for Hebrew texts, using the <code>hebrew_cc_300d</code> word embeddings. The model is pretrained and available for use with Spark NLP.</p>
</div>
""", unsafe_allow_html=True)
# Sidebar content
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    ["hebrewner_cc_300d"],
    help="For more info about the models visit: https://sparknlp.org/models"
)
# Reference notebook link in sidebar
link = """
<a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/public/NER_HE.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
</a>
"""
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)
# Load examples
examples = [
"""ื•ื”ืชื•ืฆืื” : ืกืคืจื• ื”ืคืš ืœืจื‘ ืžื›ืจ ืขื ืง ื•ื‘ืกื™ืก ืœื•ื•ื™ื›ื•ื—ื™ื ืชื™ืื•ืœื•ื’ื™ื™ื ื•ื“ื™ื•ื ื™ื ื ื–ืขืžื™ื , ื›ืžื• ื’ื ื”ืชืงืคื•ืช ื•ื”ืืฉืžื•ืช ื›ืœืคื™ ื‘ืจืื•ืŸ ืžื—ื•ื’ื™ ื”ื›ื ืกื™ื™ื” ื›ืคื™ ืฉืžืขื•ืœื ืœื ื”ืชืขื•ืจืจื• ื›ืชื•ืฆืื” ืžืกืคืจื™ื”ื ืฉืœ ื•ื•ืืœืืก ืื• ืœืื“ืœื•ื , ื•ืืฃ ื’ืจื ืœืกื•ืคืจ ืžืฆืœื™ื— ื‘ื–ื›ื•ืช ืขืฆืžื• , ื“ืŸ ื‘ื•ืจืกื˜ื™ืŸ , ืœืขืจื•ืš ืืช ื”ืกืคืจ " ื”ืกื•ื“ื•ืช ืฉืžืื—ื•ืจื™ ืฆื•ืคืŸ ื“ื” ื•ื™ื ืฆ'ื™ " , ืฉื‘ื• ื”ื•ื ื‘ื•ื“ืง ืื—ืช ืœืื—ืช ืืช ื”ืขื•ื‘ื“ื•ืช ื•ื”ื”ื ื—ื•ืช ืฉืขืœื™ื”ืŸ ืžืกืชืžืš ื‘ืจืื•ืŸ ืขืœ ื™ื“ื™ ืฉืคืข ืฉืœ ืžืืžืจื™ื , ื—ืœืงื ืžืงื•ืจื™ื™ื ื•ื—ืœืงื ืœืงื•ื—ื™ื ืžืกืคืจื™ื , ื›ืชื‘ื™ ืขืช ื•ืจืื™ื•ื ื•ืช ืขื ื—ื•ืงืจื™ื ืฉื•ื ื™ื .""",
"""ื‘ื’ืœืœ ืงื•ืฆืจ ื”ื™ืจื™ืขื” ืœื ื ืชืขืกืง ื›ืืŸ ื‘ื›ืœ ื”ื ื•ืฉืื™ื ื”ืžื’ื•ื•ื ื™ื ืฉื‘ื”ื ื“ืŸ ื”ืกืคืจ , ื›ืžื• ืœืžืฉืœ ื“ืžื•ืชื” ืฉืœ ืžืจื™ื ื”ืžื’ื“ืœื™ืช , ื”ื“ืขื•ืช ื”ืื–ื•ื˜ืจื™ื•ืช ืฉืœ ืœื™ืื•ื ืจื“ื• ื“ื” ื•ื™ื ืฆื™ ื•ื›ืŸ ื”ืœืื” , ืืœื ื ืชืžืงื“ ื‘ื ื•ืฉื ืื—ื“ - ื‘ืื’ื•ื“ืช ื”ืกืชืจ " ืžืกื“ืจ ืฆื™ื•ืŸ " - ืžืกื“ืจ ื—ืฉืื™ ื”ืงื™ื™ื ื›ื‘ื™ื›ื•ืœ ืžื–ื” ืืœืฃ ืฉื ื” , ื•ืชืคืงื™ื“ื• ืœื”ื’ืŸ ืขืœ ืฆืืฆืื™ ื”ืฉื•ืฉืœืช ื”ืž ึถืจื•ื‘ ึผื™ื ื’ื™ืช ื”ืงื“ื•ืžื” ืฉืœ ืฆืจืคืช , ืฉื”ื ืœืžืขืฉื” ืฆืืฆืื™ ื™ืฉื•ืข ื•ืžืจื™ื ื”ืžื’ื“ืœื™ืช , ื•ืœืคื™ื›ืš ื”ื , ืœื“ืขืช ื—ื‘ืจื™ ื”ืžืกื“ืจ , ื”ืฉื•ืฉืœืช ื”ืžืœื›ื•ืชื™ืช ื”ืœื’ื™ื˜ื™ืžื™ืช ืฉืœ ืฆืจืคืช , ืžื” ืฉืื•ืžืจ ื›ืžื•ื‘ืŸ ืฉืžืœื›ื™ ืฆืจืคืช ื”ื ืžืžื•ืฆื ื™ื”ื•ื“ื™ .""",
"""ื‘ 32 ื‘ืื•ืงื˜ื•ื‘ืจ ื”ืชืคืขืœื” ืžืžื ื• ื‘ืขืœืช ื˜ื•ืจ ื‘ืขื™ืชื•ืŸ " ื‘ื•ืกื˜ื•ืŸ ื’ืœื•ื‘ " ื‘ืžืœื™ื ื”ื™ืื•ืช ืœืžืขืจื™ืฆื” ื‘ืช 21 : " ื”ื•ื ืขืฉื” ื‘ื—ื•ื“ืฉื™ื ืื—ื“ื™ื ืœืžืขืŸ ืฆื—ื•ืช ื”ื“ื™ื‘ื•ืจ ืžื” ืฉืœืงื— ืœื—ื‘ืจื” ืฉื ื™ื ื›ื“ื™ ืœืขืฉื•ืช ืœืžืขืŸ ื˜ืœื•ื•ื™ื–ื™ื” ืฆื‘ืขื•ื ื™ืช ... ืื ื“ื™ื‘ื•ืจ ื”ื™ื” ืกืคื•ืจื˜ ืื•ืœื™ืžืคื™ , ื”ื•ื ื”ื™ื” ื–ื•ื›ื” ื‘ืžื“ืœื™ื™ืช ื”ื–ื”ื‘ ... ืกื™ืœื‘ืจ ื›ื” ื˜ื•ื‘ , ืขื“ ืฉื”ื•ื ื’ื•ืจื ืœืื ื’ืœื™ืช ืœื”ื™ืฉืžืข ื›ืžื• ืฆืจืคืชื™ืช ... ืื ื™ื™ื‘ื—ืจ , ืชื”ื™ื” ืœื›ื•ืœื ื• ื”ื”ื–ื“ืžื ื•ืช ืœืœืžื•ื“ ืžืžื ื• ืœื”ื™ื•ืช ืกื˜ื•ื“ื ื˜ื™ื ื‘ื›ื™ืชืชื• ื”ืขื ืงื™ืช , ื”ื ืงืจืืช ืžืกืฆื•ืกื˜ืก " .""",
"""ืœื ืžื™ื ื” ื•ืœื ืžืงืฆืชื” ! ื”ืจื™ ืฉื ืกื™ืคืจืชื™ ืขืœ ื”ื”ื’ืขื” ื‘ืงืจื•ื ื•ืช ื”ื—ื ืง , ืขืœ ื”ืžืชื™ื ืฉื˜ื•ืื˜ืื• ืžื”ืงืจื•ื ื•ืช , ืขืœ " ืงื•ืžื ื“ื• ืงื ื“ื” " , ืขืœ ืื ืฉื™ ื”ืก"ืก ื•ื›ืœื‘ื™ื”ื ื”ืืžืชื ื™ื™ื , ืขืœ ืืœื•ืžื•ืช ื”ืื•ืจ ืžื ืงืจื•ืช ื”ืขื™ื ื™ื™ื ืฉืฉืœื—ื• ื”ื–ืจืงื•ืจื™ื , ืขืœ ื‘ื›ื™ ื™ืœื“ื™ื ืฉื ืงืจืขื• ืžื–ืจื•ืขื•ืช ืืžื•ืชื™ื”ื , ื•ืœืขืชื™ื ื ืฉืืจื• ื”ืืžื”ื•ืช ื”ืฆืขื™ืจื•ืช ื‘ื—ื™ื™ื , ื•ืืชื” ืžื•ืชื™ืจ ืจืง ืžืœื™ื ื‘ื•ื“ื“ื•ืช ืขืœ ื”"ืกืœืงืฆื™ื” " .""",
"""ืฉื•ื•ื™ื“ ื—ื•ืฉืฃ ืืช ืชืžื—ื•ืจื™ ื”ืžื•ืฆืจื™ื ื”ื™ืฆื™ื‘ื™ื ืฉืœ ื”ื—ื‘ืจื”: " ื”ืžื—ื™ืจื™ื ื ื•ืชืจื• ื–ื”ื™ื : 70 ื“ื•ืœืจ ืœืื‘ื˜ื—ืช ืขืกืง ืงื˜ืŸ , 300 ื“ื•ืœืจ ืœืื‘ื˜ื—ืช ืจืฉืช ื‘ืขืกืง ืงื˜ืŸ , ื‘ื™ืŸ 1,500 ืœ - 3,500 ื“ื•ืœืจ ืœืื‘ื˜ื—ืช ื—ื‘ืจื•ืช ื’ื“ื•ืœื•ืช ืขื ืืชืจ ืจืืฉื™ ื•ืขื“ 500 ืžื—ืฉื‘ื™ื , ื‘ืืžืฆืขื•ืช ืžื•ืฆืจื™ ื”ืฆ'ืง ืคื•ื™ื ื˜ ืืงืกืคืจืก , ื•ื‘ื™ืŸ 15,000 ืœ - 20,000 ื“ื•ืœืจ ืœืขืกืง ืขื 3 ืขื“ 4 ืืชืจื™ื , ื—ื‘ืจื•ืช ื’ื“ื•ืœื•ืช ืขื ืžื—ื–ื•ืจื™ ืžื›ื™ืจื•ืช ืžืฉืžืขื•ืชื™ื™ื ."""
]
selected_text = st.selectbox("Select an example", examples)
custom_input = st.text_input("Try it with your own sentence!")
text_to_analyze = custom_input if custom_input else selected_text
st.subheader('Full example text')
HTML_WRAPPER = """<div class="scroll entities" style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem; white-space:pre-wrap">{}</div>"""
st.markdown(HTML_WRAPPER.format(text_to_analyze), unsafe_allow_html=True)
# Initialize Spark and create pipeline
spark = init_spark()
pipeline = create_pipeline(model)
output = fit_data(pipeline, text_to_analyze)
# Display matched sentence
st.subheader("Processed output:")
results = {
    'Document': output[0]['document'][0].result,
    'NER Chunk': [n.result for n in output[0]['ner_chunk']],
    'NER Label': [n.metadata['entity'] for n in output[0]['ner_chunk']]
}
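# Each fullAnnotate result maps output columns to annotation lists: 'document'
# carries the full input text, while each 'ner_chunk' annotation carries the
# chunk text in .result and its entity type in .metadata['entity'].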
annotate(results)
with st.expander("View DataFrame"):
    df = pd.DataFrame({'NER Chunk': results['NER Chunk'], 'NER Label': results['NER Label']})
    df.index += 1
    st.dataframe(df)