import os

import pandas as pd
import streamlit as st
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from annotated_text import annotated_text

# Page configuration
st.set_page_config(
    layout="wide",
    page_title="Spark NLP Demos App",
    initial_sidebar_state="auto"
)

# CSS for styling (stylesheet contents omitted in the source)
st.markdown("""
""", unsafe_allow_html=True)


@st.cache_resource
def init_spark():
    """Start (or attach to) a Spark NLP session; cached across Streamlit reruns."""
    return sparknlp.start()


@st.cache_resource
def create_pipeline(model):
    """Build the YAKE keyword-extraction pipeline; cached across Streamlit reruns."""
    document_assembler = DocumentAssembler() \
        .setInputCol('text') \
        .setOutputCol('document')

    sentence_detector = SentenceDetector() \
        .setInputCols(['document']) \
        .setOutputCol('sentences')

    tokenizer = Tokenizer() \
        .setInputCols(['sentences']) \
        .setOutputCol('tokens') \
        .setContextChars(['(', ')', '?', '!', '.', ','])

    keywords = YakeKeywordExtraction() \
        .setInputCols(['tokens']) \
        .setOutputCol('keywords') \
        .setMinNGrams(2) \
        .setMaxNGrams(5) \
        .setNKeywords(100) \
        .setStopWords(StopWordsCleaner().getStopWords())

    pipeline = Pipeline(stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        keywords
    ])
    return pipeline


def fit_data(pipeline, data):
    """Fit the pipeline on an empty frame, then annotate `data` with a LightPipeline."""
    empty_df = spark.createDataFrame([['']]).toDF('text')
    pipeline_model = pipeline.fit(empty_df)
    model = LightPipeline(pipeline_model)
    results = model.fullAnnotate(data)[0]
    return results


def highlight_keywords(data):
    """Render the document with the extracted keywords highlighted via annotated_text."""
    document_text = data["document"][0].result
    keywords = data["keywords"]
    annotations = []
    last_index = 0

    for keyword in keywords:
        keyword_text = keyword.result
        start_index = document_text.find(keyword_text, last_index)
        if start_index != -1:
            if start_index > last_index:
                annotations.append(document_text[last_index:start_index])
            annotations.append((keyword_text, 'Key Word'))
            last_index = start_index + len(keyword_text)

    if last_index < len(document_text):
        annotations.append(document_text[last_index:])

    annotated_text(*annotations)


# Set up the page layout
# NOTE: the original title markup was stripped from the source; the plain div
# (and its class name) below is an assumption.
st.markdown(
    '<div class="main-title">Detect Key Phrases With Spark NLP</div>',
    unsafe_allow_html=True
)

# Sidebar content
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    ["yake_model"],
    help="For more info about the models visit: https://sparknlp.org/models"
)

# Reference notebook link in sidebar
# NOTE: the anchor markup (including the notebook URL) was stripped from the source.
link = """Open In Colab"""
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)

# Load examples: the second line of each .txt file holds the sample text
folder_path = f"inputs/{model}"
examples = []
for filename in os.listdir(folder_path):
    if not filename.endswith('.txt'):
        continue
    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
        lines = f.readlines()
    if len(lines) >= 2:
        examples.append(lines[1].strip())

selected_text = st.selectbox("Select a sample text", examples)
custom_input = st.text_input("Try it for yourself!")

# A custom input, if given, overrides the selected sample
if custom_input:
    selected_text = custom_input

st.subheader('Selected Text')
st.write(selected_text)

# Initialize Spark and create pipeline
spark = init_spark()
pipeline = create_pipeline(model)
output = fit_data(pipeline, selected_text)

# Display output
st.subheader("Annotated Document:")
highlight_keywords(output)

keys_df = pd.DataFrame(
    [(k.result, k.begin, k.end, k.metadata['score'], k.metadata['sentence'])
     for k in output['keywords']],
    columns=['keywords', 'begin', 'end', 'score', 'sentence']
)
keys_df['score'] = keys_df['score'].astype(float)
keys_df['sentence'] = keys_df['sentence'].astype(int)

# Lower YAKE scores mean more relevant keywords, so an ascending sort
# orders each sentence's keywords by relevance.
with st.expander("View Data Table"):
    st.table(keys_df.sort_values(['sentence', 'score']))
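# To try this demo locally, a minimal sketch (the file name and the exact
# dependency list below are assumptions, not part of the original source):
#
#   pip install streamlit spark-nlp pyspark pandas st-annotated-text
#   streamlit run streamlit_app.py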