import streamlit as st import sparknlp import os import pandas as pd from sparknlp.base import * from sparknlp.annotator import * from pyspark.ml import Pipeline from sparknlp.pretrained import PretrainedPipeline from annotated_text import annotated_text # Page configuration st.set_page_config( layout="wide", initial_sidebar_state="auto" ) # CSS for styling st.markdown(""" """, unsafe_allow_html=True) @st.cache_resource def init_spark(): return sparknlp.start() @st.cache_resource def create_pipeline(model): document_assembler = DocumentAssembler() \ .setInputCol("text") \ .setOutputCol("document") sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "en") \ .setInputCols(["document"]) \ .setOutputCol("sentence") tokenizer = Tokenizer() \ .setInputCols(["sentence"]) \ .setOutputCol("token") embeddings = WordEmbeddingsModel.pretrained("glove_100d", "en") \ .setInputCols("sentence", "token") \ .setOutputCol("embeddings") ner = NerDLModel.pretrained("nerdl_snips_100d") \ .setInputCols(["sentence", "token", "embeddings"]) \ .setOutputCol("ner") ner_converter = NerConverter() \ .setInputCols(["document", "token", "ner"]) \ .setOutputCol("ner_chunk") pipeline = Pipeline(stages=[ document_assembler, sentence_detector, tokenizer, embeddings, ner, ner_converter ]) return pipeline def fit_data(pipeline, data): empty_df = spark.createDataFrame([['']]).toDF('text') pipeline_model = pipeline.fit(empty_df) model = LightPipeline(pipeline_model) result = model.fullAnnotate(data) return result def annotate(data): document, chunks, labels = data["Document"], data["NER Chunk"], data["NER Label"] annotated_words = [] for chunk, label in zip(chunks, labels): parts = document.split(chunk, 1) if parts[0]: annotated_words.append(parts[0]) annotated_words.append((chunk, label)) document = parts[1] if document: annotated_words.append(document) annotated_text(*annotated_words) # Set up the page layout st.markdown('
The nerdl_snips_100d model excels at identifying and classifying key entities in user commands related to music, restaurants, and movies. This model provides a structured representation of user intents by detecting and categorizing various entities such as genres, restaurant types, and movie titles.