abdullahmubeen10 committed
Commit 4947cdd · verified · 1 Parent(s): dc6bf5d

Upload 10 files

.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
+ [theme]
+ base="light"
+ primaryColor="#29B4E8"
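For context, this `[theme]` block pins only the base theme and accent color. Streamlit's theming accepts a handful of other optional keys; a sketch of a fuller configuration (the extra values below are Streamlit's documented light-theme defaults, shown for illustration and not part of this commit):

```toml
[theme]
base = "light"
primaryColor = "#29B4E8"
# Optional keys Streamlit theming also supports:
backgroundColor = "#FFFFFF"
secondaryBackgroundColor = "#F0F2F6"
textColor = "#262730"
font = "sans serif"
```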
Demo.py ADDED
@@ -0,0 +1,154 @@
+ import streamlit as st
+ import sparknlp
+ import os
+ import pandas as pd
+
+ from sparknlp.base import *
+ from sparknlp.annotator import *
+ from pyspark.ml import Pipeline
+ from sparknlp.pretrained import PretrainedPipeline
+ from annotated_text import annotated_text
+
+ # Page configuration
+ st.set_page_config(
+     layout="wide",
+     page_title="Spark NLP Demos App",
+     initial_sidebar_state="auto"
+ )
+
+ # CSS for styling
+ st.markdown("""
+ <style>
+     .main-title {
+         font-size: 36px;
+         color: #4A90E2;
+         font-weight: bold;
+         text-align: center;
+     }
+     .section p, .section ul {
+         color: #666666;
+     }
+     .stTable {
+         margin-left: auto;
+         margin-right: auto;
+     }
+ </style>
+ """, unsafe_allow_html=True)
+
+ @st.cache_resource
+ def init_spark():
+     return sparknlp.start()
+
+ @st.cache_resource
+ def create_pipeline(model):
+     document_assembler = DocumentAssembler() \
+         .setInputCol('text') \
+         .setOutputCol('document')
+
+     sentence_detector = SentenceDetector() \
+         .setInputCols(['document']) \
+         .setOutputCol('sentences')
+
+     tokenizer = Tokenizer() \
+         .setInputCols(['sentences']) \
+         .setOutputCol('tokens') \
+         .setContextChars(['(', ')', '?', '!', '.', ','])
+
+     keywords = YakeKeywordExtraction() \
+         .setInputCols('tokens') \
+         .setOutputCol('keywords') \
+         .setMinNGrams(2) \
+         .setMaxNGrams(5) \
+         .setNKeywords(100) \
+         .setStopWords(StopWordsCleaner().getStopWords())
+
+     pipeline = Pipeline(stages=[
+         document_assembler,
+         sentence_detector,
+         tokenizer,
+         keywords
+     ])
+
+     return pipeline
+
+ def fit_data(pipeline, data):
+     empty_df = spark.createDataFrame([['']]).toDF('text')
+     pipeline_model = pipeline.fit(empty_df)
+     model = LightPipeline(pipeline_model)
+     results = model.fullAnnotate(data)[0]
+
+     return results
+
+ def highlight_keywords(data):
+     document_text = data["document"][0].result
+     keywords = data["keywords"]
+     annotations = []
+     last_index = 0
+
+     for keyword in keywords:
+         keyword_text = keyword.result
+         start_index = document_text.find(keyword_text, last_index)
+         if start_index != -1:
+             if start_index > last_index:
+                 annotations.append(document_text[last_index:start_index])
+             annotations.append((keyword_text, 'Key Word'))
+             last_index = start_index + len(keyword_text)
+
+     if last_index < len(document_text):
+         annotations.append(document_text[last_index:])
+
+     annotated_text(*annotations)
+
+ # Set up the page layout
+ st.markdown('<div class="main-title">Detect Key Phrases With Spark NLP</div>', unsafe_allow_html=True)
+
+ # Sidebar content
+ model = st.sidebar.selectbox(
+     "Choose the pretrained model",
+     ["yake_model"],
+     help="For more info about the models visit: https://sparknlp.org/models"
+ )
+
+ # Reference notebook link in sidebar
+ link = """
+ <a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/KEYPHRASE_EXTRACTION.ipynb">
+     <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
+ </a>
+ """
+ st.sidebar.markdown('Reference notebook:')
+ st.sidebar.markdown(link, unsafe_allow_html=True)
+
+ # Load examples (each input file: line 1 is a short preview, line 2 the full text)
+ folder_path = f"inputs/{model}"
+ examples = [
+     lines[1].strip()
+     for filename in os.listdir(folder_path)
+     if filename.endswith('.txt')
+     for lines in [open(os.path.join(folder_path, filename), 'r', encoding='utf-8').readlines()]
+     if len(lines) >= 2
+ ]
+
+ selected_text = st.selectbox("Select a sample text", examples)
+ custom_input = st.text_input("Try it for yourself!")
+
+ if custom_input:
+     selected_text = custom_input
+
+ st.subheader('Selected Text')
+ st.write(selected_text)
+
+ # Initialize Spark and create pipeline
+ spark = init_spark()
+ pipeline = create_pipeline(model)
+ output = fit_data(pipeline, selected_text)
+
+ # Display output
+ st.subheader("Annotated Document:")
+ highlight_keywords(output)
+
+ keys_df = pd.DataFrame([(k.result, k.begin, k.end, k.metadata['score'], k.metadata['sentence']) for k in output['keywords']],
+                        columns=['keywords', 'begin', 'end', 'score', 'sentence'])
+ keys_df['score'] = keys_df['score'].astype(float)
+ # ordered by relevance (in YAKE, lower scores mean more relevant keywords)
+ with st.expander("View Data Table"):
+     st.table(keys_df.sort_values(['sentence', 'score']))
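Since `fit_data` wraps the fitted pipeline in a `LightPipeline`, the same flow can be smoke-tested outside Streamlit. A minimal sketch, assuming `spark-nlp` and `pyspark` are installed (the sample sentence is made up for illustration):

```python
import sparknlp
from sparknlp.base import DocumentAssembler, LightPipeline
from sparknlp.annotator import SentenceDetector, Tokenizer, YakeKeywordExtraction, StopWordsCleaner
from pyspark.ml import Pipeline

spark = sparknlp.start()

# Same stages as create_pipeline() above, minus the Streamlit caching
pipeline = Pipeline(stages=[
    DocumentAssembler().setInputCol('text').setOutputCol('document'),
    SentenceDetector().setInputCols(['document']).setOutputCol('sentences'),
    Tokenizer().setInputCols(['sentences']).setOutputCol('tokens'),
    YakeKeywordExtraction().setInputCols('tokens').setOutputCol('keywords')
        .setMinNGrams(2).setMaxNGrams(5).setNKeywords(100)
        .setStopWords(StopWordsCleaner().getStopWords()),
])

# Yake is unsupervised, so fitting on an empty frame is enough to build the model
empty_df = spark.createDataFrame([['']]).toDF('text')
light = LightPipeline(pipeline.fit(empty_df))
result = light.fullAnnotate("Spark NLP extracts key phrases with the unsupervised YAKE algorithm.")[0]
print([k.result for k in result['keywords']])
```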
Dockerfile ADDED
@@ -0,0 +1,70 @@
+ # Download base image ubuntu 18.04
+ FROM ubuntu:18.04
+
+ # Set environment variables
+ ENV NB_USER jovyan
+ ENV NB_UID 1000
+ ENV HOME /home/${NB_USER}
+
+ # Install required packages
+ RUN apt-get update && apt-get install -y \
+     tar \
+     wget \
+     bash \
+     rsync \
+     gcc \
+     libfreetype6-dev \
+     libhdf5-serial-dev \
+     libpng-dev \
+     libzmq3-dev \
+     python3 \
+     python3-dev \
+     python3-pip \
+     unzip \
+     pkg-config \
+     software-properties-common \
+     graphviz \
+     openjdk-8-jdk \
+     ant \
+     ca-certificates-java \
+     && apt-get clean \
+     && update-ca-certificates -f;
+
+ # Install Python 3.8 and pip
+ RUN add-apt-repository ppa:deadsnakes/ppa \
+     && apt-get update \
+     && apt-get install -y python3.8 python3-pip \
+     && apt-get clean;
+
+ # Set up JAVA_HOME
+ ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
+ RUN mkdir -p ${HOME} \
+     && echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> ${HOME}/.bashrc \
+     && chown -R ${NB_UID}:${NB_UID} ${HOME}
+
+ # Create a new user named "jovyan" with user ID 1000
+ RUN useradd -m -u ${NB_UID} ${NB_USER}
+
+ # Switch to the "jovyan" user
+ USER ${NB_USER}
+
+ # Set home and path variables for the user
+ ENV HOME=/home/${NB_USER} \
+     PATH=/home/${NB_USER}/.local/bin:$PATH
+
+ # Set the working directory to the user's home directory
+ WORKDIR ${HOME}
+
+ # Upgrade pip and install Python dependencies
+ RUN python3.8 -m pip install --upgrade pip
+ COPY requirements.txt /tmp/requirements.txt
+ RUN python3.8 -m pip install -r /tmp/requirements.txt
+
+ # Copy the application code into the container at /home/jovyan
+ COPY --chown=${NB_USER}:${NB_USER} . ${HOME}
+
+ # Expose port for Streamlit
+ EXPOSE 7860
+
+ # Define the entry point for the container
+ ENTRYPOINT ["streamlit", "run", "Demo.py", "--server.port=7860", "--server.address=0.0.0.0"]
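With this Dockerfile, the usual build-and-run cycle serves the app on the exposed port. A quick sketch (the `keyphrase-demo` tag is an arbitrary local name, not part of this commit):

```bash
docker build -t keyphrase-demo .
docker run -p 7860:7860 keyphrase-demo
# The Streamlit UI is then reachable at http://localhost:7860
```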
inputs/yake_model/Example1.txt ADDED
@@ -0,0 +1,2 @@
+ Extracting keywords from texts has become a challenge for individuals and organizations as the infor...
+ Extracting keywords from texts has become a challenge for individuals and organizations as the information grows in complexity and size. The need to automate this task so that text can be processed in a timely and adequate manner has led to the emergence of automatic keyword extraction tools. Yake is a novel feature-based system for multi-lingual keyword extraction, which supports texts of different sizes, domain or languages. Unlike other approaches, Yake does not rely on dictionaries nor thesauri, neither is trained against any corpora. Instead, it follows an unsupervised approach which builds upon features extracted from the text, making it thus applicable to documents written in different languages without the need for further knowledge. This can be beneficial for a large number of tasks and a plethora of situations where access to training corpora is either limited or restricted.
inputs/yake_model/Example2.txt ADDED
@@ -0,0 +1,6 @@
+ Iodine deficiency is a lack of the trace element iodine, an essential nutrient in the diet. It may r...
+ Iodine deficiency is a lack of the trace element iodine, an essential nutrient in the diet. It may result in metabolic problems such as goiter, sometimes as an endemic goiter as well as cretinism due to untreated congenital hypothyroidism, which results in developmental delays and other health problems. Iodine deficiency is an important global health issue, especially for fertile and pregnant women. It is also a preventable cause of intellectual disability.
+
+ Iodine is an essential dietary mineral for neurodevelopment among offspring and toddlers. The thyroid hormones thyroxine and triiodothyronine contain iodine. In areas where there is little iodine in the diet, typically remote inland areas where no marine foods are eaten, iodine deficiency is common. It is also common in mountainous regions of the world where food is grown in iodine-poor soil.
+
+ Prevention includes adding small amounts of iodine to table salt, a product known as iodized salt. Iodine compounds have also been added to other foodstuffs, such as flour, water and milk, in areas of deficiency. Seafood is also a well-known source of iodine.
inputs/yake_model/Example3.txt ADDED
@@ -0,0 +1,4 @@
+ The Prague Quadrennial of Performance Design and Space was established in 1967 to bring the best of ...
+ The Prague Quadrennial of Performance Design and Space was established in 1967 to bring the best of design for performance, scenography, and theatre architecture to the front line of cultural activities to be experienced by professional and emerging artists as well as the general public. The quadrennial exhibitions, festivals, and educational programs act as a global catalyst of creative progress by encouraging experimentation, networking, innovation, and future collaborations. PQ aims to honor, empower and celebrate the work of designers, artists and architects while inspiring and educating audiences, who are the most essential element of any live performance. The Prague Quadrennial strives to present performance design as an art form concerned with the creation of active performance environments that are far beyond merely decorative or beautiful, but emotionally charged, where design can become a quest, a question, an argument, a threat, a resolution, an agent of change, or a provocation. Performance design is a collaborative field where designers mix, fuse and blur the lines between multiple artistic disciplines to search for new approaches and new visions.
+
+ The Prague Quadrennial organizes an expansive program of international projects and activities between the main quadrennial events – performances, exhibitions, symposia, workshops, residencies, and educational initiatives serve as an international platform for exploring the practice, theory and education of contemporary performance design in the most encompassing terms.
inputs/yake_model/Example4.txt ADDED
@@ -0,0 +1,8 @@
+ Author Nathan Wiseman-Trowse explained that the "approach to the sheer physicality of sound" integra...
+ Author Nathan Wiseman-Trowse explained that the "approach to the sheer physicality of sound" integral to dream pop was "arguably pioneered in popular music by figures such as Phil Spector and Brian Wilson". The music of the Velvet Underground in the 1960s and 1970s, which experimented with repetition, tone, and texture over conventional song structure, was also an important touchstone in the genre's development. George Harrison's 1970 album All Things Must Pass, with its Spector-produced Wall of Sound and fluid arrangements, led music journalist John Bergstrom to credit it as a progenitor of the genre.
+
+ Reynolds described dream pop bands as "a wave of hazy neo-psychedelic groups", noting the influence of the "ethereal soundscapes" of bands such as Cocteau Twins. Rolling Stone's Kory Grow described "modern dream pop" as originating with the early 1980s work of Cocteau Twins and their contemporaries, while PopMatters' AJ Ramirez noted an evolutionary line from gothic rock to dream pop. Grow considered Julee Cruise's 1989 album Floating into the Night, written and produced by David Lynch and Angelo Badalamenti, as a significant development of the dream pop sound which "gave the genre its synthy sheen." The influence of Cocteau Twins extended to the expansion of the genre's influence into Cantopop and Mandopop through the music of Faye Wong, who covered multiple Cocteau Twins songs, including tracks featured in Chungking Express, in which she also acted. Cocteau Twins would go on to collaborate with Wong on original songs of hers, and Wong contributed vocals to a limited release of a late Cocteau Twins single.
+
+ In the early 1990s, some dream pop acts influenced by My Bloody Valentine, such as Seefeel, were drawn to techno and began utilizing elements such as samples and sequenced rhythms. Ambient pop music was described by AllMusic as "essentially an extension of the dream pop that emerged in the wake of the shoegazer movement", distinct for its incorporation of electronic textures.
+
+ Much of the music associated with the 2009-coined term "chillwave" could be considered dream pop. In the opinion of Grantland's David Schilling, when "chillwave" was popularized, the discussion that followed among music journalists and bloggers revealed that labels such as "shoegaze" and "dream pop" were ultimately "arbitrary and meaningless".
inputs/yake_model/Example5.txt ADDED
@@ -0,0 +1,6 @@
+ North Ingria was located in the Karelian Isthmus, between Finland and Soviet Russia. It was establis...
+ North Ingria was located in the Karelian Isthmus, between Finland and Soviet Russia. It was established on 23 January 1919. The republic was first served by a post office at the Rautu railway station on the Finnish side of the border. As access across the border was mainly restricted, the North Ingrian postal service was finally launched in early 1920. The man behind the idea was the lieutenant colonel Georg Elfvengren, head of the governing council of North Ingria. He was also known as an enthusiastic stamp collector. The post office was opened at the capital village of Kirjasalo.
+
+ The first series of North Ingrian stamps was issued on 21 March 1920. They were based on the 1917 Finnish "Model Saarinen" series, a stamp designed by the Finnish architect Eliel Saarinen. The first series was soon sold to collectors, as the postage stamps became the major financial source of the North Ingrian government. The second series was designed for the North Ingrian postal service and issued on 2 August 1920. The value of both series was in Finnish marks and similar to the postal fees of Finland. The number of letters sent from North Ingria was about 50 per day, most of them carried to Finland. They were mainly sent by the personnel of the Finnish occupying forces. A large number of letters were also sent for purely philatelic purposes.
+
+ With the Treaty of Tartu, the area was re-integrated into Soviet Russia and the use of the North Ingrian postage stamps ended on 4 December 1920. Stamps were still sold in Finland in 1921 with an overprint "Inkerin hyväksi" (For Ingria), but they were no longer valid. Funds from the sale went to the North Ingrian refugees.
pages/Workflow & Model Overview.py ADDED
@@ -0,0 +1,235 @@
+ import streamlit as st
+ import pandas as pd
+
+ # Custom CSS for better styling
+ st.markdown("""
+ <style>
+     .main-title {
+         font-size: 36px;
+         color: #4A90E2;
+         font-weight: bold;
+         text-align: center;
+     }
+     .sub-title {
+         font-size: 24px;
+         color: #4A90E2;
+         margin-top: 20px;
+     }
+     .section {
+         background-color: #f9f9f9;
+         padding: 15px;
+         border-radius: 10px;
+         margin-top: 20px;
+     }
+     .section h2 {
+         font-size: 22px;
+         color: #4A90E2;
+     }
+     .section p, .section ul {
+         color: #666666;
+     }
+     .link {
+         color: #4A90E2;
+         text-decoration: none;
+     }
+ </style>
+ """, unsafe_allow_html=True)
+
+ # Introduction
+ st.markdown('<div class="main-title">Keyword Extraction from Texts with Python and Spark NLP</div>', unsafe_allow_html=True)
+
+ st.markdown("""
+ <div class="section">
+     <p>Welcome to the Spark NLP Keyword Extraction Demo App! Keyword extraction is a technique in natural language processing (NLP) that involves automatically identifying the most important words or phrases in a document or corpus. Keywords extracted from a text can be used in a variety of ways, including:</p>
+     <ul>
+         <li>Document indexing</li>
+         <li>Document summarization</li>
+         <li>Content categorization</li>
+         <li>Content tagging</li>
+         <li>Search engine optimization</li>
+     </ul>
+     <p>This app demonstrates how to use Spark NLP's YakeKeywordExtraction annotator to perform keyword extraction using Python.</p>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # About Keyword Extraction
+ st.markdown('<div class="sub-title">About Keyword Extraction</div>', unsafe_allow_html=True)
+ st.markdown("""
+ <div class="section">
+     <p>Extracting keywords from texts has become difficult for individuals and organizations as the complexity and volume of information have grown. The need to automate this task so that text can be processed promptly and adequately has led to the emergence of automatic keyword extraction tools. NLP and Python libraries help in the process.</p>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # Using YakeKeywordExtraction in Spark NLP
+ st.markdown('<div class="sub-title">Using YakeKeywordExtraction in Spark NLP</div>', unsafe_allow_html=True)
+ st.markdown("""
+ <div class="section">
+     <p>Yake! is a novel feature-based system for multi-lingual keyword extraction, which supports texts of different sizes, domains, or languages. Unlike other approaches, Yake! does not rely on dictionaries or thesauri, nor is it trained against any corpora. Instead, it follows an unsupervised approach which builds upon features extracted from the text.</p>
+ </div>
+ """, unsafe_allow_html=True)
+
+ st.markdown('<div class="sub-title">Example Usage in Python</div>', unsafe_allow_html=True)
+ st.markdown("<p>Here's how you can implement keyword extraction using the YakeKeywordExtraction annotator in Spark NLP:</p>", unsafe_allow_html=True)
+
+ # Setup Instructions
+ st.markdown('<div class="sub-title">Setup</div>', unsafe_allow_html=True)
+ st.markdown('<p>To install Spark NLP and extract keywords in Python, simply use your favorite package manager (conda, pip, etc.). For example:</p>', unsafe_allow_html=True)
+ st.code("""
+ pip install spark-nlp
+ pip install pyspark
+ """, language="bash")
+
+ st.markdown("<p>Then, import Spark NLP and start a Spark session:</p>", unsafe_allow_html=True)
+ st.code("""
+ import sparknlp
+
+ # Start Spark Session
+ spark = sparknlp.start()
+ """, language='python')
+
+ # Keyword Extraction Example
+ st.markdown('<div class="sub-title">Example Usage: Keyword Extraction with YakeKeywordExtraction</div>', unsafe_allow_html=True)
+ st.code('''
+ from sparknlp.base import DocumentAssembler, Pipeline
+ from sparknlp.annotator import SentenceDetector, Tokenizer, YakeKeywordExtraction
+ import pyspark.sql.functions as F
+
+ # Step 1: Transforms raw texts to document annotation
+ document = DocumentAssembler() \\
+     .setInputCol("text") \\
+     .setOutputCol("document")
+
+ # Step 2: Sentence Detection
+ sentenceDetector = SentenceDetector() \\
+     .setInputCols(["document"]) \\
+     .setOutputCol("sentence")
+
+ # Step 3: Tokenization
+ token = Tokenizer() \\
+     .setInputCols(["sentence"]) \\
+     .setOutputCol("token") \\
+     .setContextChars(["(", ")", "?", "!", ".", ","])
+
+ # Step 4: Keyword Extraction
+ keywords = YakeKeywordExtraction() \\
+     .setInputCols(["token"]) \\
+     .setOutputCol("keywords")
+
+ # Define the pipeline
+ yake_pipeline = Pipeline(stages=[document, sentenceDetector, token, keywords])
+
+ # Create an empty dataframe
+ empty_df = spark.createDataFrame([['']]).toDF("text")
+
+ # Fit the dataframe to get the model
+ yake_model = yake_pipeline.fit(empty_df)
+
+ # Using LightPipeline
+ from sparknlp.base import LightPipeline
+
+ light_model = LightPipeline(yake_model)
+ text = """
+ google is acquiring data science community kaggle. Sources tell us that google is acquiring kaggle, a platform that hosts data science and machine learning competitions. Details about the transaction remain somewhat vague , but given that google is hosting its Cloud Next conference in san francisco this week, the official announcement could come as early as tomorrow. Reached by phone, kaggle co-founder ceo anthony goldbloom declined to deny that the acquisition is happening. google itself declined 'to comment on rumors'. kaggle, which has about half a million data scientists on its platform, was founded by Goldbloom and Ben Hamner in 2010. The service got an early start and even though it has a few competitors like DrivenData, TopCoder and HackerRank, it has managed to stay well ahead of them by focusing on its specific niche. The service is basically the de facto home for running data science and machine learning competitions. With kaggle, google is buying one of the largest and most active communities for data scientists - and with that, it will get increased mindshare in this community, too (though it already has plenty of that thanks to Tensorflow and other projects). kaggle has a bit of a history with google, too, but that's pretty recent. Earlier this month, google and kaggle teamed up to host a $100,000 machine learning competition around classifying YouTube videos. That competition had some deep integrations with the google Cloud platform, too. Our understanding is that google will keep the service running - likely under its current name. While the acquisition is probably more about Kaggle's community than technology, kaggle did build some interesting tools for hosting its competition and 'kernels', too. On kaggle, kernels are basically the source code for analyzing data sets and developers can share this code on the platform (the company previously called them 'scripts'). Like similar competition-centric sites, kaggle also runs a job board, too. It's unclear what google will do with that part of the service. According to Crunchbase, kaggle raised $12.5 million (though PitchBook says it's $12.75) since its launch in 2010. Investors in kaggle include Index Ventures, SV Angel, Max Levchin, Naval Ravikant, google chief economist Hal Varian, Khosla Ventures and Yuri Milner
+ """
+
+ light_result = light_model.fullAnnotate(text)[0]
+
+ import pandas as pd
+
+ keys_df = pd.DataFrame([(k.result, k.begin, k.end, k.metadata['score'], k.metadata['sentence']) for k in light_result['keywords']],
+                        columns=['keywords', 'begin', 'end', 'score', 'sentence'])
+ keys_df['score'] = keys_df['score'].astype(float)
+ # ordered by relevance
+ keys_df.sort_values(['sentence', 'score']).head(100)
+ ''', language='python')
+
+ data = {
+     "Keyword": ["data science", "acquiring data", "google is acquiring", "community kaggle", "science community", "acquiring data science", "data science", "machine learning", "learning competitions", "acquiring kaggle", "google is acquiring", "hosts data", "science and machine", "google cloud", "cloud platform", "google cloud platform", "index ventures", "khosla ventures", "yuri milner", "sv angel", "max levchin", "naval ravikant", "hal varian", "cloud next", "next conference", "cloud next conference", "goldbloom declined", "anthony goldbloom", "data scientists", "ben hamner", "million data", "data science", "machine learning", "learning competitions", "running data", "science and machine", "data scientists", "machine learning"],
+     "Begin": [21, 11, 1, 34, 26, 11, 123, 140, 148, 83, 73, 117, 128, 1450, 1457, 1450, 2197, 2287, 2307, 2213, 2223, 2236, 2275, 262, 268, 262, 419, 411, 567, 629, 559, 895, 912, 920, 887, 900, 1024, 1333],
+     "End": [32, 24, 19, 49, 42, 32, 134, 155, 168, 98, 91, 126, 146, 1461, 1470, 1470, 2210, 2301, 2317, 2220, 2233, 2249, 2284, 271, 282, 282, 436, 427, 581, 638, 570, 906, 927, 940, 898, 918, 1038, 1348],
+     "Score": [0.255856, 0.844244, 1.039254, 1.040628, 1.152803, 1.263860, 0.255856, 0.466911, 0.762934, 0.849239, 1.039254, 1.203691, 1.257900, 0.611960, 0.796338, 1.070615, 0.904638, 0.904638, 1.008957, 1.045587, 1.045587, 1.045587, 1.045587, 0.514866, 0.994605, 1.242611, 1.078377, 1.323419, 0.562581, 0.623279, 1.210565, 0.255856, 0.466911, 0.762934, 1.183755, 1.257900, 0.562581, 0.466911],
+     "Sentence": [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 10, 10, 10, 17, 17, 17, 17, 17, 17, 17, 2, 2, 2, 3, 3, 4, 4, 4, 6, 6, 6, 6, 6, 7, 9]
+ }
+
+ df = pd.DataFrame(data)
+
+ st.markdown(
+     """
+     <style>
+         .stTable {
+             margin-left: auto;
+             margin-right: auto;
+         }
+     </style>
+     """,
+     unsafe_allow_html=True
+ )
+
+ with st.expander("View Data Table"):
+     st.table(df)
+
+ st.markdown("""
+ <p>The code snippet demonstrates how to set up a pipeline in Spark NLP to perform keyword extraction on text data using the YakeKeywordExtraction annotator. The resulting DataFrame contains the keywords and their corresponding scores.</p>
+ """, unsafe_allow_html=True)
+
+ # Highlighting Keywords in a Text
+ st.markdown('<div class="sub-title">Highlighting Keywords in a Text</div>', unsafe_allow_html=True)
+ st.markdown("""
+ <div class="section">
+     <p>In addition to getting the keywords as a dataframe, it is also possible to highlight the extracted keywords in the text.</p>
+     <p>In this example, a dataset of 7,537 texts was used: samples from PubMed, a free resource supporting the search and retrieval of biomedical and life sciences literature.</p>
+ </div>
+ """, unsafe_allow_html=True)
+
+ st.code("""
+ import re
+ from pyspark.sql.functions import udf
+ from pyspark.sql.types import StringType
+
+ def highlight_keywords(text, keywords):
+     for keyword in keywords:
+         text = re.sub(fr'\\b{keyword}\\b', f'**{keyword}**', text, flags=re.IGNORECASE)
+     return text
+
+ highlight_udf = udf(highlight_keywords, StringType())
+
+ df_with_highlights = df.withColumn("highlighted_text", highlight_udf("text", "keywords"))
+ df_with_highlights.select("highlighted_text").show(truncate=False)
+ """, language='python')
+
+ # Conclusion
+ st.markdown('<div class="sub-title">Conclusion</div>', unsafe_allow_html=True)
+ st.markdown("""
+ <div class="section">
+     <p>In this demo, we demonstrated how to extract keywords from texts using the YakeKeywordExtraction annotator in Spark NLP. We provided step-by-step instructions on setting up the environment, creating a pipeline, and running the keyword extraction. Additionally, we explored how to highlight extracted keywords in the text.</p>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # References and Additional Information
+ st.markdown('<div class="sub-title">For additional information, please check the following references.</div>', unsafe_allow_html=True)
+
+ st.markdown("""
+ <div class="section">
+     <ul>
+         <li>Documentation: <a class="link" href="https://nlp.johnsnowlabs.com/docs/en/annotators#yakekeywordextraction" target="_blank" rel="noopener">YakeKeywordExtraction</a></li>
+         <li>Python API documentation: <a class="link" href="https://nlp.johnsnowlabs.com/api/python/reference/autosummary/sparknlp/annotator/keyword_extraction/yake_keyword_extraction/index.html" target="_blank" rel="noopener">YakeKeywordExtraction</a></li>
+         <li>Scala docs: <a class="link" href="https://nlp.johnsnowlabs.com/api/com/johnsnowlabs/nlp/annotators/keyword/yake/YakeKeywordExtraction.html">YakeKeywordExtraction</a></li>
+         <li>For extended examples of usage, see the <a class="link" href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/8.Keyword_Extraction_YAKE.ipynb" target="_blank" rel="noopener nofollow">Spark NLP Workshop repository</a>.</li>
+         <li>Reference paper: <a class="link" href="https://www.sciencedirect.com/science/article/abs/pii/S0020025519308588" target="_blank" rel="noopener nofollow">YAKE! Keyword extraction from single documents using multiple local features</a></li>
+     </ul>
+ </div>
+ """, unsafe_allow_html=True)
+
+ st.markdown('<div class="sub-title">Community & Support</div>', unsafe_allow_html=True)
+ st.markdown("""
+ <div class="section">
+     <ul>
+         <li><a class="link" href="https://sparknlp.org/" target="_blank">Official Website</a>: Documentation and examples</li>
+         <li><a class="link" href="https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q" target="_blank">Slack</a>: Live discussion with the community and team</li>
+         <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank">GitHub</a>: Bug reports, feature requests, and contributions</li>
+         <li><a class="link" href="https://medium.com/spark-nlp" target="_blank">Medium</a>: Spark NLP articles</li>
+         <li><a class="link" href="https://www.youtube.com/channel/UCmFOjlpYEhxf_wJUDuz6xxQ/videos" target="_blank">YouTube</a>: Video tutorials</li>
+     </ul>
+ </div>
+ """, unsafe_allow_html=True)
+
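One caveat on the regex-based highlighting in the last code block above: extracted keywords can contain regex metacharacters such as `(` or `?`, which `fr'\b{keyword}\b'` would interpret as pattern syntax. A safer variant of the same helper, sketched here with `re.escape` (not part of this commit):

```python
import re

def highlight_keywords(text, keywords):
    for keyword in keywords:
        # re.escape keeps characters like '(' or '?' in a keyword from being read as regex syntax
        pattern = fr'\b{re.escape(keyword)}\b'
        text = re.sub(pattern, f'**{keyword}**', text, flags=re.IGNORECASE)
    return text

print(highlight_keywords("Yake! extracts keywords with Yake.", ["Yake"]))
# prints: **Yake**! extracts keywords with **Yake**.
```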
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ streamlit
+ st-annotated-text  # provides the annotated_text component imported by Demo.py
+ pandas
+ numpy
+ spark-nlp
+ pyspark