"""Streamlit demo: recognize named entities in Chinese text with Spark NLP."""

import os
from html import escape as html_escape

# Third-party imports keep their original relative order: the wildcard imports
# come first so that the explicit `Pipeline` import below still wins any clash.
import streamlit as st
import sparknlp
import pandas as pd
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline
from annotated_text import annotated_text

# Page configuration
st.set_page_config(
    layout="wide",
    initial_sidebar_state="auto",
)

# CSS for styling
# NOTE(review): this literal (like the title, `link` and `HTML_WRAPPER` below)
# contains only whitespace in this copy of the file; the markup it presumably
# carried appears to be missing -- confirm against the original and restore.
st.markdown(""" """, unsafe_allow_html=True)


@st.cache_resource
def init_spark():
    """Start a Spark NLP session; the result is cached by Streamlit."""
    return sparknlp.start()


@st.cache_resource
def create_pipeline(model):
    """Build the (unfitted) NER pipeline for the given token-classifier name.

    Args:
        model: Name of the pretrained XLM-RoBERTa token classifier to load,
            as chosen in the sidebar.

    Returns:
        A ``pyspark.ml.Pipeline`` with the stages: document assembler,
        sentence detector, Chinese word segmenter, token classifier and
        NER converter.
    """
    document_assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )

    sentence_detector = (
        SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")
        .setInputCols(["document"])
        .setOutputCol("sentence")
    )

    tokenizer = (
        WordSegmenterModel.pretrained("wordseg_large", "zh")
        .setInputCols(["sentence"])
        .setOutputCol("token")
    )

    # Use the requested model name instead of a hard-coded one so that the
    # sidebar selection (and the cache key) actually decides what is loaded.
    token_classifier = (
        XlmRoBertaForTokenClassification.pretrained(model, "xx")
        .setInputCols(["sentence", "token"])
        .setOutputCol("ner")
    )

    ner_converter = (
        NerConverter()
        .setInputCols(["sentence", "token", "ner"])
        .setOutputCol("ner_chunk")
    )

    return Pipeline(stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        token_classifier,
        ner_converter,
    ])


def fit_data(pipeline, data):
    """Fit ``pipeline`` on an empty frame and fully annotate ``data``.

    Args:
        pipeline: The pipeline returned by ``create_pipeline``.
        data: The text to annotate.

    Returns:
        Whatever ``LightPipeline.fullAnnotate`` returns for ``data``.
    """
    # init_spark() is cached, so this yields the shared session without
    # depending on a module-level `spark` global having been assigned first.
    session = init_spark()
    empty_df = session.createDataFrame([['']]).toDF('text')
    pipeline_model = pipeline.fit(empty_df)
    return LightPipeline(pipeline_model).fullAnnotate(data)


def annotate(data):
    """Render the document with its entity chunks highlighted.

    Args:
        data: Mapping with keys ``"Document"`` (the full text),
            ``"NER Chunk"`` (chunk strings) and ``"NER Label"`` (one label
            per chunk). Chunks are matched left to right against the text
            remaining after the previous match.
    """
    remaining = data["Document"]
    annotated_words = []
    for chunk, label in zip(data["NER Chunk"], data["NER Label"]):
        # An empty chunk cannot be located (str.partition rejects an empty
        # separator), so there is nothing to highlight for it.
        if not chunk:
            continue
        before, found, after = remaining.partition(chunk)
        # Skip chunks that do not occur in the remaining text rather than
        # failing; the surrounding text is still emitted later.
        if not found:
            continue
        if before:
            annotated_words.append(before)
        annotated_words.append((chunk, label))
        remaining = after
    if remaining:
        annotated_words.append(remaining)
    annotated_text(*annotated_words)


# Set up the page layout
st.markdown(
    '\n\nRecognize entities in Chinese text\n\n',
    unsafe_allow_html=True,
)

# Sidebar content
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    ["xlm_roberta_large_token_classifier_hrl"],
    help="For more info about the models visit: https://sparknlp.org/models",
)

# Reference notebook link in sidebar
link = """

"""
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)

# Load examples
examples = [
    "当前，在中共十五大精神的指引下，在以江泽民同志为核心的中共中央领导下，全党和全国各族人民正高举邓小平理论伟大旗帜，同心同德，团结奋斗，沿着建设有中国特色的社会主义道路阔步前进。",
    "中共中央致中国致公党十一大的贺词各位代表、各位同志：在中国致公党第十一次全国代表大会隆重召开之际，中国共产党中央委员会谨向大会表示热烈的祝贺，向致公党的同志们致以亲切的问候！",
    "数百名华人、华侨、留学人员、我国驻纽约总领事馆代表在机场挥舞中美两国国旗，热烈欢迎江主席访问波士顿。",
    "到机场迎接江主席的美方人员有马萨诸塞州州长和波士顿市长等。",
    "又讯中国国家主席江泽民１日上午应邀在美国著名学府哈佛大学发表重要演讲。",
    "江主席来到哈佛大学时，受到哈佛大学校长陆登庭及哈佛各学院院长的热烈欢迎。",
    "本报纽约１０月３１日电记者陈特安、周德武报道：今天晚上，美中贸易全国委员会和美国中国商会在纽约举行盛大宴会欢迎江泽民主席。",
    "哈佛大学校长陆登庭对江主席访问哈佛并发表演讲表示欢迎。",
    "美中贸易全国委员会主席费希尔和美国中国商会会长沈被章先后致词。",
]

selected_text = st.selectbox("Select an example", examples)
custom_input = st.text_input("Try it with your own Sentence!")

# A non-empty custom sentence takes precedence over the selected example.
text_to_analyze = custom_input if custom_input else selected_text

st.subheader('Full example text')
HTML_WRAPPER = """

{}

"""
# The text may come straight from the user and is rendered with
# unsafe_allow_html=True, so escape it before interpolating it into markup.
st.markdown(
    HTML_WRAPPER.format(html_escape(text_to_analyze)),
    unsafe_allow_html=True,
)

# Initialize Spark and create pipeline
spark = init_spark()
pipeline = create_pipeline(model)
output = fit_data(pipeline, text_to_analyze)

# Display matched sentence
st.subheader("Processed output:")

results = {
    'Document': output[0]['document'][0].result,
    'NER Chunk': [n.result for n in output[0]['ner_chunk']],
    "NER Label": [n.metadata['entity'] for n in output[0]['ner_chunk']],
}

annotate(results)

with st.expander("View DataFrame"):
    df = pd.DataFrame({
        'NER Chunk': results['NER Chunk'],
        'NER Label': results['NER Label'],
    })
    # Show 1-based row numbers in the table.
    df.index += 1
    st.dataframe(df)