import streamlit as st import sparknlp import os import pandas as pd from sparknlp.base import * from sparknlp.annotator import * from pyspark.ml import Pipeline from sparknlp.pretrained import PretrainedPipeline from annotated_text import annotated_text # Page configuration st.set_page_config( layout="wide", initial_sidebar_state="auto" ) # CSS for styling st.markdown(""" """, unsafe_allow_html=True) @st.cache_resource def init_spark(): return sparknlp.start() @st.cache_resource def create_pipeline(model): document_assembler = DocumentAssembler() \ .setInputCol("text") \ .setOutputCol("document") sentence_detector = SentenceDetector() \ .setInputCols(["document"]) \ .setOutputCol("sentence") tokenizer = Tokenizer() \ .setInputCols(["sentence"]) \ .setOutputCol("token") embeddings = WordEmbeddingsModel.pretrained("glove_100d", "en") \ .setInputCols("sentence", "token") \ .setOutputCol("embeddings") ner_restaurant = NerDLModel.pretrained("nerdl_restaurant_100d", "en") \ .setInputCols(["sentence", "token", "embeddings"]) \ .setOutputCol("ner") ner_converter = NerConverter() \ .setInputCols(["sentence", "token", "ner"]) \ .setOutputCol("ner_chunk") pipeline = Pipeline(stages=[ document_assembler, sentence_detector, tokenizer, embeddings, ner_restaurant, ner_converter ]) return pipeline def fit_data(pipeline, data): empty_df = spark.createDataFrame([['']]).toDF('text') pipeline_model = pipeline.fit(empty_df) model = LightPipeline(pipeline_model) result = model.fullAnnotate(data) return result def annotate(data): document, chunks, labels = data["Document"], data["NER Chunk"], data["NER Label"] annotated_words = [] for chunk, label in zip(chunks, labels): parts = document.split(chunk, 1) if parts[0]: annotated_words.append(parts[0]) annotated_words.append((chunk, label)) document = parts[1] if document: annotated_words.append(document) annotated_text(*annotated_words) # Sidebar content model = st.sidebar.selectbox( "Choose the pretrained model", ["nerdl_restaurant_100d"], help="For more info about the models visit: https://sparknlp.org/models" ) # Set up the page layout title, sub_title = ('Detect Restaurant Terminology', 'This app utilizes the nerdl_restaurant_100d model, which is trained with GloVe 100d embeddings to detect restaurant-related terminology. The model is tailored specifically for identifying various aspects related to restaurants, such as locations, cuisines, and dish names.') st.markdown(f'

{title}

', unsafe_allow_html=True) st.markdown(f'

{sub_title}

', unsafe_allow_html=True) # Reference notebook link in sidebar link = """

""" st.sidebar.markdown('Reference notebook:') st.sidebar.markdown(link, unsafe_allow_html=True) # Load examples folder_path = f"inputs/{model}" examples = [ lines[1].strip() for filename in os.listdir(folder_path) if filename.endswith('.txt') for lines in [open(os.path.join(folder_path, filename), 'r', encoding='utf-8').readlines()] if len(lines) >= 2 ] selected_text = st.selectbox("Select an example", examples) custom_input = st.text_input("Try it with your own Sentence!") text_to_analyze = custom_input if custom_input else selected_text st.subheader('Full example text') HTML_WRAPPER = """

{}

""" st.markdown(HTML_WRAPPER.format(text_to_analyze), unsafe_allow_html=True) # Initialize Spark and create pipeline spark = init_spark() pipeline = create_pipeline(model) output = fit_data(pipeline, text_to_analyze) # Display matched sentence st.subheader("Processed output:") results = { 'Document': output[0]['document'][0].result, 'NER Chunk': [n.result for n in output[0]['ner_chunk']], "NER Label": [n.metadata['entity'] for n in output[0]['ner_chunk']] } annotate(results) with st.expander("View DataFrame"): df = pd.DataFrame({'NER Chunk': results['NER Chunk'], 'NER Label': results['NER Label']}) df.index += 1 st.dataframe(df)