import os

import streamlit as st
import sparknlp
import pandas as pd

from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline

# Page configuration (must be the first Streamlit command executed).
st.set_page_config(
    layout="wide",
    page_title="Spark NLP Demos App",
    initial_sidebar_state="auto"
)

# CSS for styling
# NOTE(review): the style block below is blank in this copy of the file; the
# original CSS appears to have been lost -- restore it if custom styling is needed.
st.markdown(""" """, unsafe_allow_html=True)


@st.cache_resource
def init_spark():
    """Start a Spark NLP session; cached so Streamlit reruns reuse it."""
    return sparknlp.start()


@st.cache_resource
def create_pipeline(model_name="spanbert_base_coref"):
    """Build the (unfitted) coreference-resolution pipeline.

    Args:
        model_name: Name of the pretrained SpanBERT coreference model to load.
            Defaults to the model the demo originally hard-coded.

    Returns:
        A ``pyspark.ml.Pipeline`` with the stages: document assembler,
        sentence detector, tokenizer, coreference model.
    """
    # Step 1: Document assembly
    document = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    # Step 2: Sentence Detection
    sentenceDetector = SentenceDetector() \
        .setInputCols("document") \
        .setOutputCol("sentences")

    # Step 3: Tokenization
    token = Tokenizer() \
        .setInputCols("sentences") \
        .setOutputCol("tokens") \
        .setContextChars(["(", ")", "?", "!", ".", ","])

    # Step 4: Coreference Resolution
    corefResolution = SpanBertCorefModel.pretrained(model_name) \
        .setInputCols(["sentences", "tokens"]) \
        .setOutputCol("corefs") \
        .setCaseSensitive(False)

    # Define the pipeline. It must be returned: without the return the caller
    # receives None and fit_data() fails on `pipeline.fit`.
    return Pipeline(stages=[document, sentenceDetector, token, corefResolution])


def fit_data(pipeline, data):
    """Fit ``pipeline`` and annotate ``data`` with a LightPipeline.

    Relies on the module-level ``spark`` session, which is created further
    down in this script before this function is called.

    Args:
        pipeline: The unfitted Spark ML pipeline.
        data: The text to annotate.

    Returns:
        The result of ``LightPipeline.fullAnnotate(data)``.
    """
    # All stages are pretrained or rule-based, so an empty frame is enough to fit.
    empty_df = spark.createDataFrame([['']]).toDF('text')
    pipeline_model = pipeline.fit(empty_df)
    model = LightPipeline(pipeline_model)
    return model.fullAnnotate(data)


def extract_to_dataframe(output):
    """Flatten the ``corefs`` annotations of ``output`` into a DataFrame.

    NOTE(review): this helper was called but not defined anywhere in the file.
    The column layout below (mention text, character offsets, then every
    metadata entry as its own column) is an assumption -- confirm it matches
    the intended presentation.

    Args:
        output: The list returned by ``fit_data`` (one dict per input text,
            mapping output column names to lists of annotations).

    Returns:
        A ``pandas.DataFrame`` with one row per coreference annotation; empty
        when no annotations were produced.
    """
    rows = []
    for result in output:
        for annotation in result.get("corefs", []):
            row = {
                "text": annotation.result,
                "begin": annotation.begin,
                "end": annotation.end,
            }
            row.update(dict(annotation.metadata))
            rows.append(row)
    return pd.DataFrame(rows)


# Set up the page layout
# NOTE(review): the HTML wrapper around the title appears to be missing from
# this copy of the file; only the text content is preserved here.
st.markdown(
    """
State-of-the-Art Coreference Resolution in Spark NLP
""",
    unsafe_allow_html=True,
)

# Sidebar content
model_name = st.sidebar.selectbox(
    "Choose the pretrained model",
    ['spanbert_base_coref'],
    help="For more info about the models visit: https://sparknlp.org/models"
)

# Reference notebook link in sidebar
# NOTE(review): the anchor/badge markup appears to be missing from this string;
# restore the link target if the badge should be clickable.
link = """ Open In Colab """
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)

# Load examples
examples = [
    "Alice went to the market. She bought some fresh vegetables there. The tomatoes she purchased were particularly ripe.",
    "Dr. Smith is a renowned surgeon. He has performed over a thousand successful operations. His colleagues respect him a lot.",
    "The company announced a new product launch. It is expected to revolutionize the industry. The CEO was very excited about it.",
    "Jennifer enjoys hiking. She goes to the mountains every weekend. Her favorite spot is the Blue Ridge Mountains.",
    "The team won the championship. They celebrated their victory with a huge party. Their coach praised their hard work and dedication.",
    "Michael is studying computer science. He finds artificial intelligence fascinating. His dream is to work at a leading tech company.",
    "The book was well-received by critics. It was praised for its intricate plot and well-developed characters. The author felt proud of his work.",
    "Sarah adopted a kitten. She named it Whiskers. Whiskers loves to play with her and often follows her around the house.",
    "The project was completed ahead of schedule. It was a collaborative effort. The team members were rewarded for their contribution.",
    "Tom is a skilled guitarist. He plays in a local band. His performances are always energetic and captivating."
]

selected_text = st.selectbox("Select an example", examples)
custom_input = st.text_input("Try it with your own Sentence!")

# Prefer the user's own text; fall back to the selected example when it is empty.
text_to_analyze = custom_input if custom_input else selected_text

st.subheader('Full example text')
st.write(text_to_analyze)

# Initialize Spark and create pipeline
spark = init_spark()
pipeline = create_pipeline(model_name)
output = fit_data(pipeline, text_to_analyze)

# Display the coreference results, numbering rows from 1 for readability.
st.subheader("Processed output:")
df = extract_to_dataframe(output)
df.index += 1
st.dataframe(df)