Upload 12 files

- .streamlit/config.toml +3 -0
- Demo.py +121 -0
- Dockerfile +70 -0
- images/Coreference-Resolution.png +0 -0
- images/johnsnowlabs-output.png +0 -0
- inputs/date_matcher/Example1.txt +5 -0
- inputs/date_matcher/Example2.txt +5 -0
- inputs/date_matcher/Example3.txt +6 -0
- inputs/date_matcher/Example4.txt +4 -0
- inputs/date_matcher/Example5.txt +3 -0
- pages/Workflow & Model Overview.py +216 -0
- requirements.txt +5 -0
.streamlit/config.toml
ADDED
@@ -0,0 +1,3 @@
+[theme]
+base="light"
+primaryColor="#29B4E8"
Demo.py
ADDED
@@ -0,0 +1,121 @@
+import streamlit as st
+import sparknlp
+import os
+import pandas as pd
+
+from sparknlp.base import *
+from sparknlp.annotator import *
+from pyspark.ml import Pipeline
+from sparknlp.pretrained import PretrainedPipeline
+
+# Page configuration
+st.set_page_config(
+    layout="wide",
+    page_title="Spark NLP Demos App",
+    initial_sidebar_state="auto"
+)
+
+# CSS for styling
+st.markdown("""
+    <style>
+        .main-title {
+            font-size: 36px;
+            color: #4A90E2;
+            font-weight: bold;
+            text-align: center;
+        }
+        .section p, .section ul {
+            color: #666666;
+        }
+    </style>
+""", unsafe_allow_html=True)
+
+@st.cache_resource
+def init_spark():
+    return sparknlp.start()
+
+@st.cache_resource
+def create_pipeline():
+    # Step 1: Transform raw text into document annotations
+    document = DocumentAssembler() \
+        .setInputCol("text") \
+        .setOutputCol("document")
+
+    # Step 2: Sentence Detection
+    sentenceDetector = SentenceDetector() \
+        .setInputCols("document") \
+        .setOutputCol("sentences")
+
+    # Step 3: Tokenization
+    token = Tokenizer() \
+        .setInputCols("sentences") \
+        .setOutputCol("tokens") \
+        .setContextChars(["(", ")", "?", "!", ".", ","])
+
+    # Step 4: Coreference Resolution
+    corefResolution = SpanBertCorefModel().pretrained("spanbert_base_coref") \
+        .setInputCols(["sentences", "tokens"]) \
+        .setOutputCol("corefs") \
+        .setCaseSensitive(False)
+
+    # Define and return the pipeline (the uploaded file built the pipeline but
+    # never returned it, so create_pipeline() evaluated to None)
+    pipeline = Pipeline(stages=[document, sentenceDetector, token, corefResolution])
+    return pipeline
+
+def fit_data(pipeline, data):
+    empty_df = spark.createDataFrame([['']]).toDF('text')
+    pipeline_model = pipeline.fit(empty_df)
+    model = LightPipeline(pipeline_model)
+    results = model.fullAnnotate(data)
+    return results
+
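+# NOTE (editorial sketch): Demo.py calls extract_to_dataframe() below, but the
+# upload never defines it. This reconstruction is an assumption: it flattens the
+# coreference annotations returned by LightPipeline.fullAnnotate() into a pandas
+# DataFrame; the column names are illustrative, not the original author's.
+def extract_to_dataframe(results):
+    rows = []
+    for annotation in results[0]['corefs']:
+        rows.append({
+            'Mention': annotation.result,
+            'Begin': annotation.begin,
+            'End': annotation.end,
+            'Head': annotation.metadata.get('head', ''),
+            'Sentence': annotation.metadata.get('sentence', '')
+        })
+    return pd.DataFrame(rows)
+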
+# Set up the page layout
+st.markdown('<div class="main-title">State-of-the-Art Coreference Resolution in Spark NLP</div>', unsafe_allow_html=True)
+
+# Sidebar content
+model_name = st.sidebar.selectbox(
+    "Choose the pretrained model",
+    ['spanbert_base_coref'],
+    help="For more info about the models visit: https://sparknlp.org/models"
+)
+
+# Reference notebook link in sidebar
+link = """
+<a href="https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/coreference-resolution/Coreference_Resolution_SpanBertCorefModel.ipynb#L117">
+    <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
+</a>
+"""
+st.sidebar.markdown('Reference notebook:')
+st.sidebar.markdown(link, unsafe_allow_html=True)
+
+# Load examples
+examples = [
+    "Alice went to the market. She bought some fresh vegetables there. The tomatoes she purchased were particularly ripe.",
+    "Dr. Smith is a renowned surgeon. He has performed over a thousand successful operations. His colleagues respect him a lot.",
+    "The company announced a new product launch. It is expected to revolutionize the industry. The CEO was very excited about it.",
+    "Jennifer enjoys hiking. She goes to the mountains every weekend. Her favorite spot is the Blue Ridge Mountains.",
+    "The team won the championship. They celebrated their victory with a huge party. Their coach praised their hard work and dedication.",
+    "Michael is studying computer science. He finds artificial intelligence fascinating. His dream is to work at a leading tech company.",
+    "The book was well-received by critics. It was praised for its intricate plot and well-developed characters. The author felt proud of his work.",
+    "Sarah adopted a kitten. She named it Whiskers. Whiskers loves to play with her and often follows her around the house.",
+    "The project was completed ahead of schedule. It was a collaborative effort. The team members were rewarded for their contribution.",
+    "Tom is a skilled guitarist. He plays in a local band. His performances are always energetic and captivating."
+]
+
+selected_text = st.selectbox("Select an example", examples)
+custom_input = st.text_input("Try it with your own Sentence!")
+
+text_to_analyze = custom_input if custom_input else selected_text
+
+st.subheader('Full example text')
+st.write(text_to_analyze)
+
+# Initialize Spark and create pipeline
+spark = init_spark()
+pipeline = create_pipeline()
+output = fit_data(pipeline, text_to_analyze)
+
+# Display the processed output
+st.subheader("Processed output:")
+df = extract_to_dataframe(output)
+df.index += 1
+st.dataframe(df)
Dockerfile
ADDED
@@ -0,0 +1,70 @@
+# Download base image ubuntu 18.04
+FROM ubuntu:18.04
+
+# Set environment variables
+ENV NB_USER jovyan
+ENV NB_UID 1000
+ENV HOME /home/${NB_USER}
+
+# Install required packages
+RUN apt-get update && apt-get install -y \
+    tar \
+    wget \
+    bash \
+    rsync \
+    gcc \
+    libfreetype6-dev \
+    libhdf5-serial-dev \
+    libpng-dev \
+    libzmq3-dev \
+    python3 \
+    python3-dev \
+    python3-pip \
+    unzip \
+    pkg-config \
+    software-properties-common \
+    graphviz \
+    openjdk-8-jdk \
+    ant \
+    ca-certificates-java \
+    && apt-get clean \
+    && update-ca-certificates -f;
+
+# Install Python 3.8 and pip
+RUN add-apt-repository ppa:deadsnakes/ppa \
+    && apt-get update \
+    && apt-get install -y python3.8 python3-pip \
+    && apt-get clean;
+
+# Set up JAVA_HOME
+ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
+RUN mkdir -p ${HOME} \
+    && echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> ${HOME}/.bashrc \
+    && chown -R ${NB_UID}:${NB_UID} ${HOME}
+
+# Create a new user named "jovyan" with user ID 1000
+RUN useradd -m -u ${NB_UID} ${NB_USER}
+
+# Switch to the "jovyan" user
+USER ${NB_USER}
+
+# Set home and path variables for the user
+ENV HOME=/home/${NB_USER} \
+    PATH=/home/${NB_USER}/.local/bin:$PATH
+
+# Set the working directory to the user's home directory
+WORKDIR ${HOME}
+
+# Upgrade pip and install Python dependencies
+RUN python3.8 -m pip install --upgrade pip
+COPY requirements.txt /tmp/requirements.txt
+RUN python3.8 -m pip install -r /tmp/requirements.txt
+
+# Copy the application code into the container at /home/jovyan
+COPY --chown=${NB_USER}:${NB_USER} . ${HOME}
+
+# Expose port for Streamlit
+EXPOSE 7860
+
+# Define the entry point for the container
+ENTRYPOINT ["streamlit", "run", "Demo.py", "--server.port=7860", "--server.address=0.0.0.0"]
images/Coreference-Resolution.png
ADDED
images/johnsnowlabs-output.png
ADDED
inputs/date_matcher/Example1.txt
ADDED
@@ -0,0 +1,5 @@
+David visited the restaurant yesterday with his family. He also visited and the day before, but at ...
+David visited the restaurant yesterday with his family.
+He also visited and the day before, but at that time he was alone.
+David again visited today with his colleagues.
+He and his friends really liked the food and hoped to visit again tomorrow.
inputs/date_matcher/Example2.txt
ADDED
@@ -0,0 +1,5 @@
+In March 2003 she was seen in the office and appeared to be extremely disturbed emotionally. On 2003...
+In March 2003 she was seen in the office and appeared to be extremely disturbed emotionally.
+On 2003-04-04 she again visited and talked about the effects of the medication she has been taking, and seemed positive and in much better shape.
+She again visited on Fri, 22/4/2003 and looked better.
+She has been working out and taking her medicines since April 1st 2003.
inputs/date_matcher/Example3.txt
ADDED
@@ -0,0 +1,6 @@
+I have a very busy schedule these days. I have meetings from 7pm. till 11pm. I have 3 meetings the d...
+I have a very busy schedule these days. I have meetings from 7pm. till 11pm.
+I have 3 meetings the day after, and have submission deadlines approaching as well.
+By next mon I have to finalise the architecture, for which i'll have to hold multiple meetings with ARM.
+Then i'll have to discuss dev plans by next tuesday and develop a thorough plan.
+The plan should be ready by Nov 30th.
inputs/date_matcher/Example4.txt
ADDED
@@ -0,0 +1,4 @@
+When Tom visited the Bahamas last year, it was his first time travelling. Since then he was travelle...
+When Tom visited the Bahamas last year, it was his first time travelling.
+Since then he was travelled a lot. For example, he visited Hawaii last week.
+The last time we talked, he was planning to travel to Alaska next month.
inputs/date_matcher/Example5.txt
ADDED
@@ -0,0 +1,3 @@
+Isn't it weird that all my family members have the same birth day and month? All of us were born on ...
+Isn't it weird that all my family members have the same birth day and month? All of us were born on 1st Jan
+Dad was born on 01/01/1900. Mom has a birth date of 1st Jan 1902. And I was born on 2000/01/01
pages/Workflow & Model Overview.py
ADDED
@@ -0,0 +1,216 @@
+import streamlit as st
+
+# Custom CSS for better styling
+st.markdown("""
+    <style>
+        .main-title {
+            font-size: 36px;
+            color: #4A90E2;
+            font-weight: bold;
+            text-align: center;
+        }
+        .sub-title {
+            font-size: 24px;
+            color: #4A90E2;
+            margin-top: 20px;
+        }
+        .section {
+            background-color: #f9f9f9;
+            padding: 15px;
+            border-radius: 10px;
+            margin-top: 20px;
+        }
+        .section h2 {
+            font-size: 22px;
+            color: #4A90E2;
+        }
+        .section p, .section ul {
+            color: #666666;
+        }
+        .link {
+            color: #4A90E2;
+            text-decoration: none;
+        }
+    </style>
+""", unsafe_allow_html=True)
+
+# Introduction
+st.markdown('<div class="main-title">Coreference Resolution with BERT-based Models in Spark NLP</div>', unsafe_allow_html=True)
+
+st.markdown("""
+<div class="section">
+    <p>Welcome to the Spark NLP Coreference Resolution Demo App! Coreference resolution is a crucial task in Natural Language Processing (NLP) that involves identifying and linking all expressions within a text that refer to the same real-world entity. It supports a wide range of applications, such as text understanding, information extraction, and question answering.</p>
+    <p>Using Spark NLP, it is possible to perform coreference resolution with high accuracy using BERT-based models. This app demonstrates how to use the SpanBertCoref annotator to resolve coreferences in text data.</p>
+</div>
+""", unsafe_allow_html=True)
+
+st.image('images/Coreference-Resolution.png', use_column_width='auto')
+
+# About Coreference Resolution
+st.markdown('<div class="sub-title">About Coreference Resolution</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <p>Coreference resolution is the task of identifying and linking all expressions within a text that refer to the same real-world entity, such as a person, object, or concept. A model analyzes the text, finds every expression that mentions a given entity, including pronouns such as “he,” “she,” “it,” or “they,” and links them together into a “coreference chain” that represents all the different ways that entity is referred to in the text.</p>
+    <p>For example, given the sentence, “John went to the store. He bought some groceries,” a coreference resolution model would identify that “John” and “He” both refer to the same entity and produce a cluster of coreferent mentions.</p>
+</div>
+""", unsafe_allow_html=True)
+
+# Using SpanBertCoref in Spark NLP
+st.markdown('<div class="sub-title">Using SpanBertCoref in Spark NLP</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <p>The SpanBertCoref annotator in Spark NLP performs coreference resolution with high accuracy using BERT-based models. It identifies and links expressions that refer to the same entity, helping extract valuable insights from unstructured text data.</p>
+    <p>The SpanBertCoref annotator offers:</p>
+    <ul>
+        <li>Accurate coreference resolution using BERT-based models</li>
+        <li>Identification and linking of multiple coreferent expressions</li>
+        <li>Efficient processing of large text datasets</li>
+        <li>Integration with other Spark NLP components for comprehensive NLP pipelines</li>
+    </ul>
+</div>
+""", unsafe_allow_html=True)
+
+st.markdown('<h2 class="sub-title">Example Usage in Python</h2>', unsafe_allow_html=True)
+st.markdown('<p>Here’s how you can implement coreference resolution using the SpanBertCoref annotator in Spark NLP:</p>', unsafe_allow_html=True)
+
+# Setup Instructions
+st.markdown('<div class="sub-title">Setup</div>', unsafe_allow_html=True)
+st.markdown('<p>To install Spark NLP in Python, use your favorite package manager (conda, pip, etc.). For example:</p>', unsafe_allow_html=True)
+st.code("""
+pip install spark-nlp
+pip install pyspark
+""", language="bash")
+
+st.markdown("<p>Then, import Spark NLP and start a Spark session:</p>", unsafe_allow_html=True)
+st.code("""
+import sparknlp
+
+# Start Spark Session
+spark = sparknlp.start()
+""", language='python')
+
+# Coreference Resolution Example
+st.markdown('<div class="sub-title">Example Usage: Coreference Resolution with SpanBertCoref</div>', unsafe_allow_html=True)
+st.code('''
+from sparknlp.base import DocumentAssembler, Pipeline
+from sparknlp.annotator import (
+    SentenceDetector,
+    Tokenizer,
+    SpanBertCorefModel
+)
+import pyspark.sql.functions as F
+
+# Step 1: Transform raw text into document annotations
+document = DocumentAssembler() \\
+    .setInputCol("text") \\
+    .setOutputCol("document")
+
+# Step 2: Sentence Detection
+sentenceDetector = SentenceDetector() \\
+    .setInputCols("document") \\
+    .setOutputCol("sentences")
+
+# Step 3: Tokenization
+token = Tokenizer() \\
+    .setInputCols("sentences") \\
+    .setOutputCol("tokens") \\
+    .setContextChars(["(", ")", "?", "!", ".", ","])
+
+# Step 4: Coreference Resolution
+corefResolution = SpanBertCorefModel().pretrained("spanbert_base_coref") \\
+    .setInputCols(["sentences", "tokens"]) \\
+    .setOutputCol("corefs") \\
+    .setCaseSensitive(False)
+
+# Define the pipeline
+pipeline = Pipeline(stages=[document, sentenceDetector, token, corefResolution])
+
+# Create the dataframe
+data = spark.createDataFrame([["Ana is a Graduate Student at UT Dallas. She loves working in Natural Language Processing at the Institute. Her hobbies include blogging, dancing, and singing."]]).toDF("text")
+
+# Fit the dataframe to the pipeline to get the model
+model = pipeline.fit(data)
+
+# Transform the data to get predictions
+result = model.transform(data)
+
+# Display the extracted coreferences
+result.selectExpr("explode(corefs) AS coref").selectExpr("coref.result as token", "coref.metadata").show(truncate=False)
+''', language='python')
+
+st.text("""
++-------------+------------------------------------------------------------------------------------------+
+|token        |metadata                                                                                  |
++-------------+------------------------------------------------------------------------------------------+
+|ana          |{head.sentence -> -1, head -> ROOT, head.begin -> -1, head.end -> -1, sentence -> 0}      |
+|she          |{head.sentence -> 0, head -> ana, head.begin -> 0, head.end -> 2, sentence -> 1}          |
+|her          |{head.sentence -> 0, head -> ana, head.begin -> 0, head.end -> 2, sentence -> 2}          |
+|ut dallas    |{head.sentence -> -1, head -> ROOT, head.begin -> -1, head.end -> -1, sentence -> 0}      |
+|the institute|{head.sentence -> 0, head -> ut dallas, head.begin -> 29, head.end -> 37, sentence -> 1}  |
++-------------+------------------------------------------------------------------------------------------+
+""")
+
+st.markdown("""
+<p>The code snippet demonstrates how to set up a pipeline in Spark NLP to resolve coreferences in text data using the SpanBertCoref annotator. The resulting DataFrame contains the coreferent mentions and their metadata.</p>
+""", unsafe_allow_html=True)
+
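+# NOTE (editorial addition, not part of the original upload): Demo.py serves
+# single strings through a LightPipeline rather than transform(); the hedged
+# sketch below shows that pattern, reusing the `model` fitted in the example
+# above.
+st.markdown('<div class="sub-title">Annotating Plain Strings with LightPipeline</div>', unsafe_allow_html=True)
+st.code('''
+from sparknlp.base import LightPipeline
+
+# Wrap the fitted PipelineModel for fast inference on plain Python strings
+light_model = LightPipeline(model)
+annotations = light_model.fullAnnotate("Ana is a Graduate Student at UT Dallas. She loves working in Natural Language Processing at the Institute.")
+
+# fullAnnotate returns one dict of annotation lists per input string;
+# coreference mentions appear under the "corefs" output column
+for annotation in annotations[0]["corefs"]:
+    print(annotation.result, annotation.metadata)
+''', language='python')
+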
+# One-liner Alternative
+st.markdown('<div class="sub-title">One-liner Alternative</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <p>In October 2022, John Snow Labs released the open-source <code>johnsnowlabs</code> library, which bundles all of the company's products, open-source and licensed, under one common library. This simplifies the workflow, especially for users working with more than one of the libraries (e.g., Spark NLP + Healthcare NLP). The new library is a wrapper around all of John Snow Labs' libraries and can be installed with pip:</p>
+    <p><code>pip install johnsnowlabs</code></p>
+    <p>To run coreference resolution with one line of code, we can simply:</p>
+</div>
+""", unsafe_allow_html=True)
+st.code("""
+# Import the NLP module, which contains the Spark NLP and NLU libraries
+from johnsnowlabs import nlp
+
+sample_text = "Ana is a Graduate Student at UT Dallas. She loves working in Natural Language Processing at the Institute. Her hobbies include blogging, dancing, and singing."
+
+# Returns a pandas DataFrame with the predictions at sentence level
+nlp.load('en.coreference.spanbert').predict(sample_text, output_level='sentence')
+""", language='python')
+
+st.image('images/johnsnowlabs-output.png', use_column_width='auto')
+
+st.markdown("""
+<p>This approach demonstrates how to use the <code>johnsnowlabs</code> library to perform coreference resolution with a single line of code. The resulting DataFrame contains the coreferent mentions and their metadata.</p>
+""", unsafe_allow_html=True)
+
+# Conclusion
+st.markdown("""
+<div class="section">
+    <h2>Conclusion</h2>
+    <p>In this app, we demonstrated how to use Spark NLP's SpanBertCoref annotator to resolve coreferences in text data. This tool lets users efficiently process large datasets and identify coreferent mentions, providing deeper insights for various applications. By integrating the annotator into your NLP pipelines, you can better extract entity relationships from unstructured text, improving text understanding, information extraction, and question answering.</p>
+</div>
+""", unsafe_allow_html=True)
+
+# References and Additional Information
+st.markdown('<div class="sub-title">For additional information, please check the following references.</div>', unsafe_allow_html=True)
+
+st.markdown("""
+<div class="section">
+    <ul>
+        <li>Documentation: <a href="https://nlp.johnsnowlabs.com/docs/en/transformers#spanbertcoref" target="_blank" rel="noopener">SpanBertCoref</a></li>
+        <li>Python Docs: <a href="https://nlp.johnsnowlabs.com/api/python/reference/autosummary/sparknlp/annotator/coref/spanbert_coref/index.html#sparknlp.annotator.coref.spanbert_coref.SpanBertCorefModel" target="_blank" rel="noopener">SpanBertCorefModel</a></li>
+        <li>Scala Docs: <a href="https://nlp.johnsnowlabs.com/api/com/johnsnowlabs/nlp/annotators/coref/SpanBertCorefModel.html" target="_blank" rel="noopener">SpanBertCorefModel</a></li>
+        <li>Academic reference paper: <a href="https://arxiv.org/abs/1907.10529" target="_blank" rel="noopener nofollow">SpanBERT: Improving Pre-training by Representing and Predicting Spans</a></li>
+        <li>John Snow Labs <a href="https://nlp.johnsnowlabs.com/2022/06/14/spanbert_base_coref_en_3_0.html" target="_blank" rel="noopener">SpanBertCoref Model</a></li>
+    </ul>
+</div>
+""", unsafe_allow_html=True)
+
+st.markdown('<div class="sub-title">Community & Support</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <ul>
+        <li><a class="link" href="https://sparknlp.org/" target="_blank">Official Website</a>: Documentation and examples</li>
+        <li><a class="link" href="https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q" target="_blank">Slack</a>: Live discussion with the community and team</li>
+        <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank">GitHub</a>: Bug reports, feature requests, and contributions</li>
+        <li><a class="link" href="https://medium.com/spark-nlp" target="_blank">Medium</a>: Spark NLP articles</li>
+        <li><a class="link" href="https://www.youtube.com/channel/UCmFOjlpYEhxf_wJUDuz6xxQ/videos" target="_blank">YouTube</a>: Video tutorials</li>
+    </ul>
+</div>
+""", unsafe_allow_html=True)
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+streamlit
+pandas
+numpy
+spark-nlp
+pyspark