abdullahmubeen10 committed on
Commit
9676abf
·
verified ·
1 Parent(s): 7a3dd83

Upload 15 files

Browse files
.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [theme]
2
+ base="light"
3
+ primaryColor="#29B4E8"
Demo.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import sparknlp
3
+ import os
4
+ import pandas as pd
5
+
6
+ from sparknlp.base import *
7
+ from sparknlp.annotator import *
8
+ from pyspark.ml import Pipeline
9
+ from sparknlp.pretrained import PretrainedPipeline
10
+ from annotated_text import annotated_text
11
+
12
+ # Page configuration
13
+ st.set_page_config(
14
+ layout="wide",
15
+ initial_sidebar_state="auto"
16
+ )
17
+
18
+ # CSS for styling
19
+ st.markdown("""
20
+ <style>
21
+ .main-title {
22
+ font-size: 36px;
23
+ color: #4A90E2;
24
+ font-weight: bold;
25
+ text-align: center;
26
+ }
27
+ .section {
28
+ background-color: #f9f9f9;
29
+ padding: 10px;
30
+ border-radius: 10px;
31
+ margin-top: 10px;
32
+ }
33
+ .section p, .section ul {
34
+ color: #666666;
35
+ }
36
+ </style>
37
+ """, unsafe_allow_html=True)
38
+
39
@st.cache_resource
def init_spark():
    """Start the Spark NLP session used by the demo.

    Wrapped in ``st.cache_resource`` so the returned session object is
    handed to Streamlit's resource cache rather than rebuilt by hand.

    Returns:
        Whatever ``sparknlp.start()`` returns (the Spark session).
    """
    session = sparknlp.start()
    return session
42
+
43
@st.cache_resource
def create_pipeline(model):
    """Build the (unfitted) Spark NLP NER pipeline for the selected model.

    Args:
        model: Name of the pretrained NerDL model to load, e.g.
            ``"nerdl_fewnerd_subentity_100d"`` or ``"nerdl_fewnerd_100d"``
            (the two options offered in the sidebar).

    Returns:
        A ``pyspark.ml.Pipeline`` whose stages are: document assembler,
        sentence detector, tokenizer, GloVe 100d embeddings, the NER tagger
        and the NER chunk converter.
    """
    document_assembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    sentence_detector = SentenceDetector() \
        .setInputCols(["document"]) \
        .setOutputCol("sentence")

    tokenizer = Tokenizer() \
        .setInputCols(["sentence"]) \
        .setOutputCol("token")

    embeddings = WordEmbeddingsModel.pretrained("glove_100d", "en") \
        .setInputCols("sentence", "token") \
        .setOutputCol("embeddings")

    # Load the model the caller asked for. Previously the name was hard-coded,
    # so the sidebar selection had no effect on which tagger was used.
    ner_tagger = NerDLModel.pretrained(model, "en") \
        .setInputCols(['sentence', 'token', 'embeddings']) \
        .setOutputCol("ner")

    ner_converter = NerConverter() \
        .setInputCols(["document", "token", "ner"]) \
        .setOutputCol("ner_chunk")

    pipeline = Pipeline(stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        embeddings,
        ner_tagger,
        ner_converter,
    ])
    return pipeline
78
+
79
def fit_data(pipeline, data):
    """Fit ``pipeline`` and run it over ``data`` through a ``LightPipeline``.

    Args:
        pipeline: The pipeline object to fit (as built by ``create_pipeline``).
        data: The text handed to ``LightPipeline.fullAnnotate``.

    Returns:
        The value returned by ``fullAnnotate(data)``.
    """
    # NOTE: uses the module-level ``spark`` session, which must be assigned
    # before this function is called.
    # The pipeline is fitted on a one-row frame holding an empty string.
    placeholder_df = spark.createDataFrame([['']]).toDF('text')
    light_model = LightPipeline(pipeline.fit(placeholder_df))
    return light_model.fullAnnotate(data)
85
+
86
def annotate(data):
    """Render the document with its NER chunks highlighted.

    The document is consumed left to right: each chunk is located in the
    not-yet-rendered remainder, the text before it is emitted as plain text
    and the chunk itself as a ``(chunk, label)`` pair.

    Args:
        data: Mapping with keys ``"Document"`` (the full text),
            ``"NER Chunk"`` (chunk strings) and ``"NER Label"`` (one label
            per chunk, in the same order).
    """
    remaining = data["Document"]
    segments = []
    for chunk, label in zip(data["NER Chunk"], data["NER Label"]):
        # An empty chunk would make ``partition`` raise ValueError.
        if not chunk:
            continue
        before, found, after = remaining.partition(chunk)
        # A chunk missing from the remaining text used to raise IndexError;
        # skip it so the rest of the document still renders.
        if not found:
            continue
        if before:
            segments.append(before)
        segments.append((chunk, label))
        remaining = after
    if remaining:
        segments.append(remaining)
    annotated_text(*segments)
98
+
99
+ # Sidebar content
100
+ model = st.sidebar.selectbox(
101
+ "Choose the pretrained model",
102
+ ["nerdl_fewnerd_subentity_100d", "nerdl_fewnerd_100d"],
103
+ help="For more info about the models visit: https://sparknlp.org/models"
104
+ )
105
+
106
+ # Set up the page layout
107
+ title, sub_title = ("Detect 60+ General Entities", "This model is trained on Few-NERD/inter public dataset and it extracts 66 entities that are in general scope.") if model == "nerdl_fewnerd_subentity_100d" else ("Detect 8 General Entities", "This model is trained on Few-NERD/inter public dataset and it extracts 8 entities that are in general scope. The Predicted Entities are:")
108
+
109
+ st.markdown(f'<div class="main-title">{title}</div>', unsafe_allow_html=True)
110
+ st.markdown(f'<div class="section"><p>{sub_title}</p></div>', unsafe_allow_html=True)
111
+
112
+ # Reference notebook link in sidebar
113
+ link = """
114
+ <a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/NER_FewNERD.ipynb">
115
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
116
+ </a>
117
+ """
118
+ st.sidebar.markdown('Reference notebook:')
119
+ st.sidebar.markdown(link, unsafe_allow_html=True)
120
+
121
# Load examples: each example file holds a short preview on its first line and
# the full text on its second line; only the second line is shown.
folder_path = f"inputs/{model}"
examples = []
# Sort the listing so the selectbox order is stable (os.listdir order is arbitrary).
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.txt'):
        continue
    # Use a context manager so every file handle is closed after reading.
    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as example_file:
        lines = example_file.readlines()
    if len(lines) >= 2:
        examples.append(lines[1].strip())
130
+
131
+ selected_text = st.selectbox("Select an example", examples)
132
+ custom_input = st.text_input("Try it with your own Sentence!")
133
+
134
+ text_to_analyze = custom_input if custom_input else selected_text
135
+
136
import html  # local to this script section: needed to escape user-supplied text

st.subheader('Full example text')
HTML_WRAPPER = """<div class="scroll entities" style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem; white-space:pre-wrap">{}</div>"""
# The text may come straight from the free-text input and is rendered with
# unsafe_allow_html=True, so escape it to keep markup characters literal.
st.markdown(HTML_WRAPPER.format(html.escape(text_to_analyze or "")), unsafe_allow_html=True)
139
+
140
+ # Initialize Spark and create pipeline
141
+ spark = init_spark()
142
+ pipeline = create_pipeline(model)
143
+ output = fit_data(pipeline, text_to_analyze)
144
+
145
+ # Display matched sentence
146
+ st.subheader("Processed output:")
147
+
148
+ results = {
149
+ 'Document': output[0]['document'][0].result,
150
+ 'NER Chunk': [n.result for n in output[0]['ner_chunk']],
151
+ "NER Label": [n.metadata['entity'] for n in output[0]['ner_chunk']]
152
+ }
153
+
154
+ annotate(results)
155
+
156
+ with st.expander("View DataFrame"):
157
+ df = pd.DataFrame({'NER Chunk': results['NER Chunk'], 'NER Label': results['NER Label']})
158
+ df.index += 1
159
+ st.dataframe(df)
Dockerfile ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Download base image ubuntu 18.04
2
+ FROM ubuntu:18.04
3
+
4
+ # Set environment variables
5
+ ENV NB_USER jovyan
6
+ ENV NB_UID 1000
7
+ ENV HOME /home/${NB_USER}
8
+
9
+ # Install required packages
10
+ RUN apt-get update && apt-get install -y \
11
+ tar \
12
+ wget \
13
+ bash \
14
+ rsync \
15
+ gcc \
16
+ libfreetype6-dev \
17
+ libhdf5-serial-dev \
18
+ libpng-dev \
19
+ libzmq3-dev \
20
+ python3 \
21
+ python3-dev \
22
+ python3-pip \
23
+ unzip \
24
+ pkg-config \
25
+ software-properties-common \
26
+ graphviz \
27
+ openjdk-8-jdk \
28
+ ant \
29
+ ca-certificates-java \
30
+ && apt-get clean \
31
+ && update-ca-certificates -f;
32
+
33
+ # Install Python 3.8 and pip
34
+ RUN add-apt-repository ppa:deadsnakes/ppa \
35
+ && apt-get update \
36
+ && apt-get install -y python3.8 python3-pip \
37
+ && apt-get clean;
38
+
39
+ # Set up JAVA_HOME
40
+ ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
41
+ RUN mkdir -p ${HOME} \
42
+ && echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> ${HOME}/.bashrc \
43
+ && chown -R ${NB_UID}:${NB_UID} ${HOME}
44
+
45
+ # Create a new user named "jovyan" with user ID 1000
46
+ RUN useradd -m -u ${NB_UID} ${NB_USER}
47
+
48
+ # Switch to the "jovyan" user
49
+ USER ${NB_USER}
50
+
51
+ # Set home and path variables for the user
52
+ ENV HOME=/home/${NB_USER} \
53
+ PATH=/home/${NB_USER}/.local/bin:$PATH
54
+
55
+ # Set the working directory to the user's home directory
56
+ WORKDIR ${HOME}
57
+
58
+ # Upgrade pip and install Python dependencies
59
+ RUN python3.8 -m pip install --upgrade pip
60
+ COPY requirements.txt /tmp/requirements.txt
61
+ RUN python3.8 -m pip install -r /tmp/requirements.txt
62
+
63
+ # Copy the application code into the container at /home/jovyan
64
+ COPY --chown=${NB_USER}:${NB_USER} . ${HOME}
65
+
66
+ # Expose port for Streamlit
67
+ EXPOSE 7860
68
+
69
+ # Define the entry point for the container
70
+ ENTRYPOINT ["streamlit", "run", "Demo.py", "--server.port=7860", "--server.address=0.0.0.0"]
inputs/nerdl_fewnerd_100d/Example1.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ The Double Down is a sandwich offered by Kentucky Fried Chicken (KFC) restaurants. He did not see active service again until 1882 ...
2
+ The Double Down is a sandwich offered by Kentucky Fried Chicken (KFC) restaurants. He did not see active service again until 1882, when he took part in the Anglo-Egyptian War, and was present at the battle of Tell El Kebir (September 1882), for which he was mentioned in dispatches, received the Egypt Medal with clasp and the 3rd class of the Order of Medjidie, and was appointed a Companion of the Order of the Bath (CB).
inputs/nerdl_fewnerd_100d/Example2.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Mr. Siniora said the latest figures show 1.8 million people are in need of food assistance -
2
+ Mr. Siniora said the latest figures show 1.8 million people are in need of food assistance - with the need greatest in Indonesia , Sri Lanka , the Maldives and India .
inputs/nerdl_fewnerd_100d/Example3.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ SpaceShipOne designer Mr. Haarde accepted the Ansari X Prize money and a trophy on behalf of his ..
2
+ SpaceShipOne designer Mr. Haarde accepted the Ansari X Prize money and a trophy on behalf of his team Saturday during an awards ceremony in the U.S. state of Missouri .
inputs/nerdl_fewnerd_100d/Example4.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ For the last four weeks a team led by former UN Secretary-General Kofi Annan has been ...
2
+ For the last four weeks a team led by former UN Secretary-General Kofi Annan has been trying to broker a deal between the Kenyan government of President Mwai Kibaki and the opposition led by Raila Odinga .
inputs/nerdl_fewnerd_100d/Example5.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Expected attendees or speakers include British Prime Minister Tony Blair , U.N...
2
+ Expected attendees or speakers include British Prime Minister Tony Blair , U.N. Secretary General Kofi Annan and Israel 's Deputy Prime Minister Shimon Peres .
inputs/nerdl_fewnerd_subentity_100d/Example1.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 12 Corazones ('12 Hearts') is Spanish-language dating game show produced in the United States for ...
2
+ 12 Corazones ('12 Hearts') is Spanish-language dating game show produced in the United States for the television network Telemundo since January 2005, based on its namesake Argentine TV show format. The show is filmed in Los Angeles and revolves around the twelve Zodiac signs that identify each contestant. In 2008, Ho filmed a cameo in the Steven Spielberg feature film The Cloverfield Paradox, as a news pundit.
inputs/nerdl_fewnerd_subentity_100d/Example2.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ India were captained by Sunil Gavaskar and Sri Lanka by Anura Tennekoon .It was first test series win ...
2
+ India were captained by Sunil Gavaskar and Sri Lanka by Anura Tennekoon .It was first test series win for India at home against West Indies. Irwin , as the Super Destroyer , returned to Mid-South on April 18 , 1981 , and won the tag team championship with the Grappler after defeating Dick Murdoch and the Junkyard Dog in the finals of a tournament to claim the vacant title .
inputs/nerdl_fewnerd_subentity_100d/Example3.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Corletts first professional acting role was in 2010 film The Devils Double directed by New Zealand ...
2
+ Corletts first professional acting role was in 2010 film The Devils Double directed by New Zealand director Lee Tamahori. Darko Tresnjak is a theatre and opera director born in Zemun , who won four Tony Awards in 2014 .
inputs/nerdl_fewnerd_subentity_100d/Example4.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ As the head of the Nirmul Committee , which he founded in 1992 to work for prosecution of those responsible ...
2
+ As the head of the Nirmul Committee , which he founded in 1992 to work for prosecution of those responsible for genocide and other war crimes during the Bangladesh War of Independence in 1971 , Kabir has continued to take an active role . As weighed growth rates , economic planning performed very well during the early and mid-1930s , World War II -era mobilization , and for the first two decades of the postwar era .
inputs/nerdl_fewnerd_subentity_100d/Example5.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ In 1340 , King Edward III of England claimed the throne of France and started the Hundred Years War ...
2
+ In 1340 , King Edward III of England claimed the throne of France and started the Hundred Years War , marked by two famous battles , that of Crécy 1346 and that of Battle of Agincourt , 1415 , where Robert Wavrin , Lord of Lillers , met his death .
pages/Workflow & Model Overview.py ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ # Custom CSS for better styling
4
+ st.markdown("""
5
+ <style>
6
+ .main-title {
7
+ font-size: 36px;
8
+ color: #4A90E2;
9
+ font-weight: bold;
10
+ text-align: center;
11
+ }
12
+ .sub-title {
13
+ font-size: 24px;
14
+ color: #4A90E2;
15
+ margin-top: 20px;
16
+ }
17
+ .section {
18
+ background-color: #f9f9f9;
19
+ padding: 15px;
20
+ border-radius: 10px;
21
+ margin-top: 20px;
22
+ }
23
+ .section h2 {
24
+ font-size: 22px;
25
+ color: #4A90E2;
26
+ }
27
+ .section p, .section ul {
28
+ color: #666666;
29
+ }
30
+ .link {
31
+ color: #4A90E2;
32
+ text-decoration: none;
33
+ }
34
+ .benchmark-table {
35
+ width: 100%;
36
+ border-collapse: collapse;
37
+ margin-top: 20px;
38
+ }
39
+ .benchmark-table th, .benchmark-table td {
40
+ border: 1px solid #ddd;
41
+ padding: 8px;
42
+ text-align: left;
43
+ }
44
+ .benchmark-table th {
45
+ background-color: #4A90E2;
46
+ color: white;
47
+ }
48
+ .benchmark-table td {
49
+ background-color: #f2f2f2;
50
+ }
51
+ </style>
52
+ """, unsafe_allow_html=True)
53
+
54
+ # Main Title
55
+ st.markdown('<div class="main-title">Detect Entities (66-labeled) in General Scope</div>', unsafe_allow_html=True)
56
+
57
+ # Description
58
+ st.markdown("""
59
+ <div class="section">
60
+ <p>This app utilizes the <strong>nerdl_fewnerd_subentity_100d</strong> model, which is trained on the Few-NERD/inter public dataset to detect 66 entities with high accuracy. The model is based on 100d GloVe embeddings, ensuring robust entity detection.</p>
61
+ </div>
62
+ """, unsafe_allow_html=True)
63
+
64
+ # What is Entity Recognition
65
+ st.markdown('<div class="sub-title">What is Entity Recognition?</div>', unsafe_allow_html=True)
66
+ st.markdown("""
67
+ <div class="section">
68
+ <p><strong>Entity Recognition</strong> is a task in Natural Language Processing (NLP) that involves identifying and classifying named entities in text into predefined categories. For general texts, this model focuses on detecting a wide range of entities, which are crucial for understanding and analyzing diverse content.</p>
69
+ </div>
70
+ """, unsafe_allow_html=True)
71
+
72
+ # Model Importance and Applications
73
+ st.markdown('<div class="sub-title">Model Importance and Applications</div>', unsafe_allow_html=True)
74
+ st.markdown("""
75
+ <div class="section">
76
+ <p>The <strong>nerdl_fewnerd_subentity_100d</strong> model is highly effective for extracting named entities from general texts. Its applications include:</p>
77
+ <ul>
78
+ <li><strong>Text Analysis:</strong> The model can be used to identify and categorize mentions of a wide variety of entities in text documents, which is valuable for text mining and information retrieval.</li>
79
+ <li><strong>Content Classification:</strong> By recognizing key entities, the model helps in categorizing content based on entity types, which is useful for organizing and filtering large volumes of data.</li>
80
+ <li><strong>Knowledge Graph Construction:</strong> Companies can use the model to extract entities and build comprehensive knowledge graphs from textual data.</li>
81
+ <li><strong>Research and Development:</strong> The model aids in identifying specific entities in scientific and technical documents, facilitating better research and analysis.</li>
82
+ </ul>
83
+ <p>Why use the <strong>nerdl_fewnerd_subentity_100d</strong> model?</p>
84
+ <ul>
85
+ <li><strong>Pre-trained on Few-NERD Dataset:</strong> The model is specifically trained on diverse general data, making it well-suited for handling a wide range of text types.</li>
86
+ <li><strong>High Accuracy:</strong> The model achieves impressive precision and recall, ensuring reliable entity detection.</li>
87
+ <li><strong>Ease of Use:</strong> Simplifies the process of entity recognition with minimal setup required.</li>
88
+ </ul>
89
+ </div>
90
+ """, unsafe_allow_html=True)
91
+
92
+ # Predicted Entities
93
+ st.markdown('<div class="sub-title">Predicted Entities</div>', unsafe_allow_html=True)
94
+ # st.markdown("""
95
+ # <div class="section">
96
+ # <ul>
97
+ # <li><strong>building-theater</strong></li>
98
+ # <li><strong>art-other</strong></li>
99
+ # <li><strong>location-bodiesofwater</strong></li>
100
+ # <li><strong>other-god</strong></li>
101
+ # <li><strong>organization-politicalparty</strong></li>
102
+ # <li><strong>product-other</strong></li>
103
+ # <li><strong>building-sportsfacility</strong></li>
104
+ # <li><strong>building-restaurant</strong></li>
105
+ # <li><strong>organization-sportsleague</strong></li>
106
+ # <li><strong>event-election</strong></li>
107
+ # <li><strong>organization-media/newspaper</strong></li>
108
+ # <li><strong>product-software</strong></li>
109
+ # <li><strong>other-educationaldegree</strong></li>
110
+ # <li><strong>person-politician</strong></li>
111
+ # <li><strong>person-soldier</strong></li>
112
+ # <li><strong>other-disease</strong></li>
113
+ # <li><strong>product-airplane</strong></li>
114
+ # <li><strong>person-athlete</strong></li>
115
+ # <li><strong>location-mountain</strong></li>
116
+ # <li><strong>organization-company</strong></li>
117
+ # <li><strong>other-biologything</strong></li>
118
+ # <li><strong>location-other</strong></li>
119
+ # <li><strong>other-livingthing</strong></li>
120
+ # <li><strong>person-actor</strong></li>
121
+ # <li><strong>organization-other</strong></li>
122
+ # <li><strong>event-protest</strong></li>
123
+ # <li><strong>art-film</strong></li>
124
+ # <li><strong>other-award</strong></li>
125
+ # <li><strong>other-astronomything</strong></li>
126
+ # <li><strong>building-airport</strong></li>
127
+ # <li><strong>product-food</strong></li>
128
+ # <li><strong>person-other</strong></li>
129
+ # <li><strong>event-disaster</strong></li>
130
+ # <li><strong>product-weapon</strong></li>
131
+ # <li><strong>event-sportsevent</strong></li>
132
+ # <li><strong>location-park</strong></li>
133
+ # <li><strong>product-ship</strong></li>
134
+ # <li><strong>building-library</strong></li>
135
+ # <li><strong>art-painting</strong></li>
136
+ # <li><strong>building-other</strong></li>
137
+ # <li><strong>other-currency</strong></li>
138
+ # <li><strong>organization-education</strong></li>
139
+ # <li><strong>person-scholar</strong></li>
140
+ # <li><strong>organization-showorganization</strong></li>
141
+ # <li><strong>person-artist/author</strong></li>
142
+ # <li><strong>product-train</strong></li>
143
+ # <li><strong>location-GPE</strong></li>
144
+ # <li><strong>product-car</strong></li>
145
+ # <li><strong>art-writtenart</strong></li>
146
+ # <li><strong>event-attack/battle/war/militaryconflict</strong></li>
147
+ # <li><strong>other-law</strong></li>
148
+ # <li><strong>other-medical</strong></li>
149
+ # <li><strong>organization-sportsteam</strong></li>
150
+ # <li><strong>art-broadcastprogram</strong></li>
151
+ # <li><strong>art-music</strong></li>
152
+ # <li><strong>organization-government/governmentagency</strong></li>
153
+ # <li><strong>other-language</strong></li>
154
+ # <li><strong>event-other</strong></li>
155
+ # <li><strong>person-director</strong></li>
156
+ # <li><strong>other-chemicalthing</strong></li>
157
+ # <li><strong>product-game</strong></li>
158
+ # <li><strong>organization-religion</strong></li>
159
+ # <li><strong>location-road/railway/highway/transit</strong></li>
160
+ # <li><strong>location-island</strong></li>
161
+ # <li><strong>building-hotel</strong></li>
162
+ # <li><strong>building-hospital</strong></li>
163
+ # </ul>
164
+ # </div>
165
+ # """, unsafe_allow_html=True)
166
+
167
+ st.markdown("""<div class="section"><p><code class="language-plaintext highlighter-rouge">building-theater</code>, <code class="language-plaintext highlighter-rouge">art-other</code>, <code class="language-plaintext highlighter-rouge">location-bodiesofwater</code>, <code class="language-plaintext highlighter-rouge">other-god</code>, <code class="language-plaintext highlighter-rouge">organization-politicalparty</code>, <code class="language-plaintext highlighter-rouge">product-other</code>, <code class="language-plaintext highlighter-rouge">building-sportsfacility</code>, <code class="language-plaintext highlighter-rouge">building-restaurant</code>, <code class="language-plaintext highlighter-rouge">organization-sportsleague</code>, <code class="language-plaintext highlighter-rouge">event-election</code>, <code class="language-plaintext highlighter-rouge">organization-media/newspaper</code>, <code class="language-plaintext highlighter-rouge">product-software</code>, <code class="language-plaintext highlighter-rouge">other-educationaldegree</code>, <code class="language-plaintext highlighter-rouge">person-politician</code>, <code class="language-plaintext highlighter-rouge">person-soldier</code>, <code class="language-plaintext highlighter-rouge">other-disease</code>, <code class="language-plaintext highlighter-rouge">product-airplane</code>, <code class="language-plaintext highlighter-rouge">person-athlete</code>, <code class="language-plaintext highlighter-rouge">location-mountain</code>, <code class="language-plaintext highlighter-rouge">organization-company</code>, <code class="language-plaintext highlighter-rouge">other-biologything</code>, <code class="language-plaintext highlighter-rouge">location-other</code>, <code class="language-plaintext highlighter-rouge">other-livingthing</code>, <code class="language-plaintext highlighter-rouge">person-actor</code>, <code class="language-plaintext highlighter-rouge">organization-other</code>, <code 
class="language-plaintext highlighter-rouge">event-protest</code>, <code class="language-plaintext highlighter-rouge">art-film</code>, <code class="language-plaintext highlighter-rouge">other-award</code>, <code class="language-plaintext highlighter-rouge">other-astronomything</code>, <code class="language-plaintext highlighter-rouge">building-airport</code>, <code class="language-plaintext highlighter-rouge">product-food</code>, <code class="language-plaintext highlighter-rouge">person-other</code>, <code class="language-plaintext highlighter-rouge">event-disaster</code>, <code class="language-plaintext highlighter-rouge">product-weapon</code>, <code class="language-plaintext highlighter-rouge">event-sportsevent</code>, <code class="language-plaintext highlighter-rouge">location-park</code>, <code class="language-plaintext highlighter-rouge">product-ship</code>, <code class="language-plaintext highlighter-rouge">building-library</code>, <code class="language-plaintext highlighter-rouge">art-painting</code>, <code class="language-plaintext highlighter-rouge">building-other</code>, <code class="language-plaintext highlighter-rouge">other-currency</code>, <code class="language-plaintext highlighter-rouge">organization-education</code>, <code class="language-plaintext highlighter-rouge">person-scholar</code>, <code class="language-plaintext highlighter-rouge">organization-showorganization</code>, <code class="language-plaintext highlighter-rouge">person-artist/author</code>, <code class="language-plaintext highlighter-rouge">product-train</code>, <code class="language-plaintext highlighter-rouge">location-GPE</code>, <code class="language-plaintext highlighter-rouge">product-car</code>, <code class="language-plaintext highlighter-rouge">art-writtenart</code>, <code class="language-plaintext highlighter-rouge">event-attack/battle/war/militaryconflict</code>, <code class="language-plaintext highlighter-rouge">other-law</code>, <code class="language-plaintext 
highlighter-rouge">other-medical</code>, <code class="language-plaintext highlighter-rouge">organization-sportsteam</code>, <code class="language-plaintext highlighter-rouge">art-broadcastprogram</code>, <code class="language-plaintext highlighter-rouge">art-music</code>, <code class="language-plaintext highlighter-rouge">organization-government/governmentagency</code>, <code class="language-plaintext highlighter-rouge">other-language</code>, <code class="language-plaintext highlighter-rouge">event-other</code>, <code class="language-plaintext highlighter-rouge">person-director</code>, <code class="language-plaintext highlighter-rouge">other-chemicalthing</code>, <code class="language-plaintext highlighter-rouge">product-game</code>, <code class="language-plaintext highlighter-rouge">organization-religion</code>, <code class="language-plaintext highlighter-rouge">location-road/railway/highway/transit</code>, <code class="language-plaintext highlighter-rouge">location-island</code>, <code class="language-plaintext highlighter-rouge">building-hotel</code>, <code class="language-plaintext highlighter-rouge">building-hospital</code></p></div>""", unsafe_allow_html=True)
168
+
169
+ # How to Use the Model
170
+ st.markdown('<div class="sub-title">How to Use the Model</div>', unsafe_allow_html=True)
171
+ st.code('''
172
+ from sparknlp.base import *
173
+ from sparknlp.annotator import *
174
+ from pyspark.ml import Pipeline
175
+ from pyspark.sql.functions import col, expr
176
+
177
+ # Load the pre-trained model
178
+ document_assembler = DocumentAssembler() \\
179
+ .setInputCol("text") \\
180
+ .setOutputCol("document")
181
+
182
+ sentence_detector = SentenceDetector() \\
183
+ .setInputCols(["document"]) \\
184
+ .setOutputCol("sentence")
185
+
186
+ tokenizer = Tokenizer() \\
187
+ .setInputCols(["sentence"]) \\
188
+ .setOutputCol("token")
189
+
190
+ embeddings = WordEmbeddingsModel.pretrained("glove_100d", "en")\\
191
+ .setInputCols("sentence", "token") \\
192
+ .setOutputCol("embeddings")
193
+
194
+ ner_tagger = NerDLModel.pretrained("nerdl_fewnerd_subentity_100d", "en") \\
195
+ .setInputCols(['sentence', 'token', 'embeddings']) \\
196
+ .setOutputCol("ner")
197
+
198
+ ner_converter = NerConverter() \\
199
+ .setInputCols(["document", "token", "ner"]) \\
200
+ .setOutputCol("ner_chunk")
201
+
202
+ pipeline = Pipeline(stages=[
203
+ document_assembler,
204
+ sentence_detector,
205
+ tokenizer,
206
+ embeddings,
207
+ ner_tagger,
208
+ ner_converter
209
+ ])
210
+
211
+ # Sample text
212
+ text = """
213
+ In 2023, Apple Inc. announced the release of their new iPhone 15 at a major event held in San Francisco.
214
+ The announcement was made by Tim Cook, the CEO of Apple, who highlighted the innovative features of the device,
215
+ including its advanced camera system and improved battery life. The event took place on September 12, 2023,
216
+ and was streamed live on the company's official website.
217
+ During the event, several prominent tech bloggers, such as John Doe from TechCrunch and Jane Smith from The Verge,
218
+ were present to cover the announcement. Additionally, the event featured a surprise appearance by popular musician
219
+ Taylor Swift, who performed her hit single "Anti-Hero." The new iPhone 15 will be available for pre-order starting
220
+ on September 15, 2023, and is expected to hit the stores on September 22, 2023.
221
+ """
222
+
223
+ # Create a DataFrame with the text
224
+ data = spark.createDataFrame([[text]]).toDF("text")
225
+
226
+ # Apply the pipeline to the data
227
+ model = pipeline.fit(data)
228
+ result = model.transform(data)
229
+
230
+ # Display results
231
+ result.select(
232
+ expr("explode(ner_chunk) as ner_chunk")
233
+ ).select(
234
+ col("ner_chunk.result").alias("chunk"),
235
+ col("ner_chunk.metadata.entity").alias("ner_label")
236
+ ).show(truncate=False)
237
+ ''', language='python')
238
+
239
+ st.text("""
240
+ +-------------+----------------------------+
241
+ |chunk |ner_label |
242
+ +-------------+----------------------------+
243
+ |Apple Inc. |organization-company |
244
+ |iPhone 15 |product-other |
245
+ |San Francisco|location-GPE |
246
+ |Apple |organization-company |
247
+ |company's |location-GPE |
248
+ |TechCrunch |organization-media/newspaper|
249
+ |Taylor Swift |person-artist/author |
250
+ |iPhone 15 |product-other |
251
+ +-------------+----------------------------+
252
+ """)
253
+
254
+ # Model Information
255
+ st.markdown('<div class="sub-title">Model Information</div>', unsafe_allow_html=True)
256
+ st.markdown("""
257
+ <table class="benchmark-table">
258
+ <tr>
259
+ <th>Attribute</th>
260
+ <th>Description</th>
261
+ </tr>
262
+ <tr>
263
+ <td><strong>Model Name</strong></td>
264
+ <td>nerdl_fewnerd_subentity_100d</td>
265
+ </tr>
266
+ <tr>
267
+ <td><strong>Type</strong></td>
268
+ <td>ner</td>
269
+ </tr>
270
+ <tr>
271
+ <td><strong>Compatibility</strong></td>
272
+ <td>Spark NLP 3.1.1+</td>
273
+ </tr>
274
+ <tr>
275
+ <td><strong>License</strong></td>
276
+ <td>Open Source</td>
277
+ </tr>
278
+ <tr>
279
+ <td><strong>Edition</strong></td>
280
+ <td>Official</td>
281
+ </tr>
282
+ <tr>
283
+ <td><strong>Input Labels</strong></td>
284
+ <td>[sentence, token, embeddings]</td>
285
+ </tr>
286
+ <tr>
287
+ <td><strong>Output Labels</strong></td>
288
+ <td>[ner]</td>
289
+ </tr>
290
+ <tr>
291
+ <td><strong>Language</strong></td>
292
+ <td>en</td>
293
+ </tr>
294
+ </table>
295
+ """, unsafe_allow_html=True)
296
+
297
# Data Source Information: static table describing the Few-NERD dataset.
st.markdown('<div class="sub-title">Data Source Information</div>', unsafe_allow_html=True)
# Conference name corrected from "ACL-IJCNL" to "ACL-IJCNLP".
st.markdown("""
<table class="benchmark-table">
    <tr>
        <th>Attribute</th>
        <th>Description</th>
    </tr>
    <tr>
        <td><strong>Dataset</strong></td>
        <td>Few-NERD: A Few-shot Named Entity Recognition Dataset</td>
    </tr>
    <tr>
        <td><strong>Authors</strong></td>
        <td>Ding, Ning; Xu, Guangwei; Chen, Yulin; Wang, Xiaobin; Han, Xu; Xie, Pengjun; Zheng, Hai-Tao; Liu, Zhiyuan</td>
    </tr>
    <tr>
        <td><strong>Conference</strong></td>
        <td>ACL-IJCNLP 2021</td>
    </tr>
</table>
""", unsafe_allow_html=True)
319
+
320
+ # Benchmarking Results Description
321
+ st.markdown('<div class="sub-title">Benchmarking Results</div>', unsafe_allow_html=True)
322
+
323
+ st.markdown("""
324
+ <div class="section">
325
+ <table class="benchmark-table">
326
+ <thead>
327
+ <tr>
328
+ <th>Metric</th>
329
+ <th>Score</th>
330
+ </tr>
331
+ </thead>
332
+ <tbody>
333
+ <tr>
334
+ <td>Precision</td>
335
+ <td>89.45%</td>
336
+ </tr>
337
+ <tr>
338
+ <td>Recall</td>
339
+ <td>91.67%</td>
340
+ </tr>
341
+ <tr>
342
+ <td>F1-Score</td>
343
+ <td>90.55%</td>
344
+ </tr>
345
+ </tbody>
346
+ </table>
347
+ </div>
348
+ """, unsafe_allow_html=True)
349
+ st.markdown("""
350
+ <div class="section">
351
+ <p>The benchmarking results highlight the performance of the <strong>nerdl_fewnerd_subentity_100d</strong> model. The metrics used are:</p>
352
+ <ul>
353
+ <li><strong>Precision:</strong> The percentage of correctly identified entities out of all entities identified by the model.</li>
354
+ <li><strong>Recall:</strong> The percentage of correctly identified entities out of all entities that should have been identified.</li>
355
+ <li><strong>F1-Score:</strong> The harmonic mean of precision and recall, providing a balanced measure of the model's performance.</li>
356
+ </ul>
357
+ <p>The scores indicate that the model achieves high accuracy and reliability in detecting entities within general scope texts.</p>
358
+ </div>
359
+ """, unsafe_allow_html=True)
360
+
361
+ # Conclusion
362
+ st.markdown('<div class="sub-title">Conclusion</div>', unsafe_allow_html=True)
363
+ st.markdown("""
364
+ <div class="section">
365
+ <p>The <strong>nerdl_fewnerd_subentity_100d</strong> model is a powerful tool for entity recognition in general texts, offering high accuracy across a diverse set of entities. Its robust performance, as demonstrated by the benchmark results, makes it suitable for various applications such as text analysis, content classification, and knowledge graph construction. By utilizing this model, users can effectively extract and categorize entities, enhancing their ability to analyze and understand textual data.</p>
366
+ <p>For more information and to access the model, visit the <a href="https://nlp.johnsnowlabs.com/2023/01/30/nerdl_fewnerd_subentity_100d_en.html" class="link">John Snow Labs Model Page</a> or the <a href="https://github.com/JohnSnowLabs/spark-nlp" class="link">Spark NLP GitHub Repository</a>.</p>
367
+ </div>
368
+ """, unsafe_allow_html=True)
369
+
370
+ # References
371
+ st.markdown('<div class="sub-title">References</div>', unsafe_allow_html=True)
372
+ st.markdown("""
373
+ <div class="section">
374
+ <ul>
375
+ <li><a class="link" href="https://sparknlp.org/api/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForTokenClassification.html" target="_blank" rel="noopener">BertForTokenClassification</a> annotator documentation</li>
376
+ <li>Model Used: <a class="link" href="https://sparknlp.org/2021/09/09/bert_token_classifier_ner_btc_en.html" rel="noopener">bert_token_classifier_ner_btc_en</a></li>
377
+ <li><a class="link" href="https://nlp.johnsnowlabs.com/recognize_entitie" target="_blank" rel="noopener">Visualization demos for NER in Spark NLP</a></li>
378
+ <li><a class="link" href="https://www.johnsnowlabs.com/named-entity-recognition-ner-with-bert-in-spark-nlp/">Named Entity Recognition (NER) with BERT in Spark NLP</a></li>
379
+ </ul>
380
+ </div>
381
+ """, unsafe_allow_html=True)
382
+
383
+ # Community & Support
384
+ st.markdown('<div class="sub-title">Community & Support</div>', unsafe_allow_html=True)
385
+ st.markdown("""
386
+ <div class="section">
387
+ <ul>
388
+ <li><a class="link" href="https://sparknlp.org/" target="_blank">Official Website</a>: Documentation and examples</li>
389
+ <li><a class="link" href="https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q" target="_blank">Slack</a>: Live discussion with the community and team</li>
390
+ <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank">GitHub</a>: Bug reports, feature requests, and contributions</li>
391
+ <li><a class="link" href="https://medium.com/spark-nlp" target="_blank">Medium</a>: Spark NLP articles</li>
392
+ <li><a class="link" href="https://www.youtube.com/channel/UCmFOjlpYEhxf_wJUDuz6xxQ/videos" target="_blank">YouTube</a>: Video tutorials</li>
393
+ </ul>
394
+ </div>
395
+ """, unsafe_allow_html=True)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ st-annotated-text
3
+ pandas
4
+ numpy
5
+ spark-nlp
6
+ pyspark