abdullahmubeen10 committed on
Commit
f3d1f0c
·
verified ·
1 Parent(s): 4c983c8

Update Demo.py

Files changed (1)
  1. Demo.py +158 -158
Demo.py CHANGED
@@ -1,158 +1,158 @@
- import streamlit as st
- import sparknlp
- import os
- import pandas as pd
-
- from sparknlp.base import *
- from sparknlp.annotator import *
- from pyspark.ml import Pipeline
- from sparknlp.pretrained import PretrainedPipeline
- from annotated_text import annotated_text
-
- # Page configuration
- st.set_page_config(
-     layout="wide",
-     initial_sidebar_state="auto"
- )
-
- # CSS for styling
- st.markdown("""
-     <style>
-         .main-title {
-             font-size: 36px;
-             color: #4A90E2;
-             font-weight: bold;
-             text-align: center;
-         }
-         .section {
-             background-color: #f9f9f9;
-             padding: 10px;
-             border-radius: 10px;
-             margin-top: 10px;
-         }
-         .section p, .section ul {
-             color: #666666;
-         }
-     </style>
- """, unsafe_allow_html=True)
-
- @st.cache_resource
- def init_spark():
-     return sparknlp.start()
-
- @st.cache_resource
- def create_pipeline(model):
-     document_assembler = DocumentAssembler() \
-         .setInputCol("text") \
-         .setOutputCol("document")
-
-     sentence_detector = SentenceDetector() \
-         .setInputCols(["document"]) \
-         .setOutputCol("sentence")
-
-     word_segmenter = WordSegmenterModel.pretrained("wordseg_kaist_ud", "ko") \
-         .setInputCols(["sentence"]) \
-         .setOutputCol("token")
-
-     embeddings = WordEmbeddingsModel.pretrained("glove_840B_300", "xx") \
-         .setInputCols(["document", "token"]) \
-         .setOutputCol("embeddings")
-
-     ner = NerDLModel.pretrained("ner_kmou_glove_840B_300d", "ko") \
-         .setInputCols(["document", "token", "embeddings"]) \
-         .setOutputCol("ner")
-
-     ner_converter = NerConverter().setInputCols(["document", "token", "ner"]).setOutputCol("ner_chunk")
-
-     pipeline = Pipeline(stages=[document_assembler, sentence_detector, word_segmenter, embeddings, ner, ner_converter])
-     return nlpPipeline
-
- def fit_data(pipeline, data):
-     empty_df = spark.createDataFrame([['']]).toDF('text')
-     pipeline_model = pipeline.fit(empty_df)
-     model = LightPipeline(pipeline_model)
-     result = model.fullAnnotate(data)
-     return result
-
- def annotate(data):
-     document, chunks, labels = data["Document"], data["NER Chunk"], data["NER Label"]
-     annotated_words = []
-     for chunk, label in zip(chunks, labels):
-         parts = document.split(chunk, 1)
-         if parts[0]:
-             annotated_words.append(parts[0])
-         annotated_words.append((chunk, label))
-         document = parts[1]
-     if document:
-         annotated_words.append(document)
-     annotated_text(*annotated_words)
-
- # Set up the page layout
- st.markdown('<div class="main-title">Recognize entities in Urdu text</div>', unsafe_allow_html=True)
- st.markdown("""
- <div class="section">
-     <p>This model uses the pre-trained <code>glove_840B_300</code> embeddings model from WordEmbeddings annotator as an input</p>
- </div>
- """, unsafe_allow_html=True)
-
- # Sidebar content
- model = st.sidebar.selectbox(
-     "Choose the pretrained model",
-     ["ner_kmou_glove_840B_300d"],
-     help="For more info about the models visit: https://sparknlp.org/models"
- )
-
- # Reference notebook link in sidebar
- link = """
- <a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/public/NER_KO.ipynb">
-     <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
- </a>
- """
- st.sidebar.markdown('Reference notebook:')
- st.sidebar.markdown(link, unsafe_allow_html=True)
-
- # Load examples
- examples = [
- """ARD , ZDF λ“± 곡영 TV 와 바이에λ₯Έμ£Ό 방솑 , 뢁뢀 독일 방솑 λ“± 은 이 λ‚  ν•œκ΅­ 의 총선 μ†Œμ‹ κ³Ό κ΄€λ ¨ , μ—¬λ‹Ή 의 과반수 μ˜μ„ 확보 와 μ‹ λ‹Ή 의 λ“ν‘œ 율 이 이번 μ„ κ±° 의 μ΅œλŒ€ 관심사 이 라고 보도 ν•˜ γ„΄ 데 μž‡ μ–΄ 저녁 μ‹œκ°„ λΆ€ν„° λŠ” 수 μ°¨λ‘€ 에 걸치 μ–΄ κ°œν‘œ 상황 κ³Ό μ •λ‹Ή 별 μ˜μ„ 전망 을 속보 둜 μ „ν•˜ μ•˜ λ‹€ .""",
- """두 λ‚˜λΌ 관계 λŠ” 쀑ꡭ 의 인ꢌ 문제 와 ν•΅ν™•μ‚° λ°©μ§€ 문제 , 톡상 문제 및 졜근 의 F 16 μ „νˆ¬κΈ° λŒ€ λŒ€λ§Œ 판맀 λ“± 을 놓 κ³  이미 μœ„ν—˜μ„  상 에 였 μ•„ 있 λŠ”λ° 클린턴 ν–‰μ •λΆ€ 의 λ“±μž₯ 으둜 μ–‘κ΅­ 관계 κ°€ λ”μš± 경색 되 γ„Ή 것 을 κ±±μ • ν•˜ λŠ” λΆ„μœ„κΈ° .""",
- """μ„œμšΈλŒ€ 건좕곡학 κ³Ό λ₯Ό μ‘Έμ—… ν•˜ γ„΄ 이 씨 λŠ” ν•œκ΅­κ±΄μΆ•κ°€ν˜‘νšŒ""",
- """λ‚˜ λŠ” λ‹€μ‹œ 순자 λ₯Ό 양동 μ—μ„œ λΉΌλ‚΄ κΈ° μœ„ν•˜ μ•„μ„œ μ°½μ‹  νŒ”λ™""",
- """헀라신전 μ„œ 채화 되 γ„΄ μ§€ 보름 , 지ꡬ 의 λ°˜λ°”ν€΄ λ₯Ό 돌 μ•„ μ œμ£Όκ³΅ν•­ 에 첫발 을 λ‚΄λ””λ”” γ„΄ 이래 둜 열이틀""",
- """λ‹€μŒ 은 홍콩 의 κΆŒμœ„μ§€ λͺ…보 와 일본 도쿄 ( 동경 ) μ‹ λ¬Έ 이 24일""",
- """졜 μ˜μ‚¬ κ°€ 우리 외ꡐ관 이 λ©° κ·Έ μ‹ λ³€λ³΄ν˜Έ μ±…μž„ 이 주재ꡭ 이 γ„΄ λŸ¬μ‹œμ•„ 에 있 λ‹€λŠ” 점 μ—μ„œ λŸ¬μ‹œμ•„ λŠ” 이 κ°™ 은 우리 μ •λΆ€ μš”κ΅¬ οΏ½οΏ½οΏ½ μ‘ν•˜ μ•„μ•Ό ν•˜ γ„Ή 의무 κ°€ 있 λ‹€ .""",
- """판 에 λ°• 은 λ“― ν•˜ γ„΄ 깨끗 ν•˜ γ„΄ 글씨 둜 , 처음 단ꡰ λ‹˜ 이 λ‹ˆ 신라 , 백제 , 고ꡬ렀 이 λ‹ˆ 띄엄띄엄 μ–΄λ₯Έ λ“€ ν•œν…Œ μ„œ κ·€κ²° 둜 λ“€μ–΄μ˜€ 던 μ–˜κΈ° λ“€ 이 참말 둜 μ”Œ μ–΄ 있 μ—ˆ λ‹€ ."""
- ]
-
- selected_text = st.selectbox("Select an example", examples)
- custom_input = st.text_input("Try it with your own Sentence!")
-
- text_to_analyze = custom_input if custom_input else selected_text
-
- st.subheader('Full example text')
- HTML_WRAPPER = """<div class="scroll entities" style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem; white-space:pre-wrap">{}</div>"""
- st.markdown(HTML_WRAPPER.format(text_to_analyze), unsafe_allow_html=True)
-
- # Initialize Spark and create pipeline
- spark = init_spark()
- pipeline = create_pipeline(model)
- output = fit_data(pipeline, text_to_analyze)
-
- # Display matched sentence
- st.subheader("Processed output:")
-
- results = {
-     'Document': output[0]['document'][0].result,
-     'NER Chunk': [n.result for n in output[0]['ner_chunk']],
-     "NER Label": [n.metadata['entity'] for n in output[0]['ner_chunk']]
- }
-
- annotate(results)
-
- with st.expander("View DataFrame"):
-     df = pd.DataFrame({'NER Chunk': results['NER Chunk'], 'NER Label': results['NER Label']})
-     df.index += 1
-     st.dataframe(df)
-
-
-
-
 
+ import streamlit as st
+ import sparknlp
+ import os
+ import pandas as pd
+
+ from sparknlp.base import *
+ from sparknlp.annotator import *
+ from pyspark.ml import Pipeline
+ from sparknlp.pretrained import PretrainedPipeline
+ from annotated_text import annotated_text
+
+ # Page configuration
+ st.set_page_config(
+     layout="wide",
+     initial_sidebar_state="auto"
+ )
+
+ # CSS for styling
+ st.markdown("""
+     <style>
+         .main-title {
+             font-size: 36px;
+             color: #4A90E2;
+             font-weight: bold;
+             text-align: center;
+         }
+         .section {
+             background-color: #f9f9f9;
+             padding: 10px;
+             border-radius: 10px;
+             margin-top: 10px;
+         }
+         .section p, .section ul {
+             color: #666666;
+         }
+     </style>
+ """, unsafe_allow_html=True)
+
+ @st.cache_resource
+ def init_spark():
+     return sparknlp.start()
+
+ @st.cache_resource
+ def create_pipeline(model):
+     document_assembler = DocumentAssembler() \
+         .setInputCol("text") \
+         .setOutputCol("document")
+
+     sentence_detector = SentenceDetector() \
+         .setInputCols(["document"]) \
+         .setOutputCol("sentence")
+
+     word_segmenter = WordSegmenterModel.pretrained("wordseg_kaist_ud", "ko") \
+         .setInputCols(["sentence"]) \
+         .setOutputCol("token")
+
+     embeddings = WordEmbeddingsModel.pretrained("glove_840B_300", "xx") \
+         .setInputCols(["document", "token"]) \
+         .setOutputCol("embeddings")
+
+     ner = NerDLModel.pretrained("ner_kmou_glove_840B_300d", "ko") \
+         .setInputCols(["document", "token", "embeddings"]) \
+         .setOutputCol("ner")
+
+     ner_converter = NerConverter().setInputCols(["document", "token", "ner"]).setOutputCol("ner_chunk")
+
+     pipeline = Pipeline(stages=[document_assembler, sentence_detector, word_segmenter, embeddings, ner, ner_converter])
+     return pipeline
+
+ def fit_data(pipeline, data):
+     empty_df = spark.createDataFrame([['']]).toDF('text')
+     pipeline_model = pipeline.fit(empty_df)
+     model = LightPipeline(pipeline_model)
+     result = model.fullAnnotate(data)
+     return result
+
+ def annotate(data):
+     document, chunks, labels = data["Document"], data["NER Chunk"], data["NER Label"]
+     annotated_words = []
+     for chunk, label in zip(chunks, labels):
+         parts = document.split(chunk, 1)
+         if parts[0]:
+             annotated_words.append(parts[0])
+         annotated_words.append((chunk, label))
+         document = parts[1]
+     if document:
+         annotated_words.append(document)
+     annotated_text(*annotated_words)
+
+ # Set up the page layout
+ st.markdown('<div class="main-title">Recognize entities in Urdu text</div>', unsafe_allow_html=True)
+ st.markdown("""
+ <div class="section">
+     <p>This model uses the pre-trained <code>glove_840B_300</code> embeddings model from WordEmbeddings annotator as an input</p>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # Sidebar content
+ model = st.sidebar.selectbox(
+     "Choose the pretrained model",
+     ["ner_kmou_glove_840B_300d"],
+     help="For more info about the models visit: https://sparknlp.org/models"
+ )
+
+ # Reference notebook link in sidebar
+ link = """
+ <a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/public/NER_KO.ipynb">
+     <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
+ </a>
+ """
+ st.sidebar.markdown('Reference notebook:')
+ st.sidebar.markdown(link, unsafe_allow_html=True)
+
+ # Load examples
+ examples = [
+ """ARD , ZDF λ“± 곡영 TV 와 바이에λ₯Έμ£Ό 방솑 , 뢁뢀 독일 방솑 λ“± 은 이 λ‚  ν•œκ΅­ 의 총선 μ†Œμ‹ κ³Ό κ΄€λ ¨ , μ—¬λ‹Ή 의 과반수 μ˜μ„ 확보 와 μ‹ λ‹Ή 의 λ“ν‘œ 율 이 이번 μ„ κ±° 의 μ΅œλŒ€ 관심사 이 라고 보도 ν•˜ γ„΄ 데 μž‡ μ–΄ 저녁 μ‹œκ°„ λΆ€ν„° λŠ” 수 μ°¨λ‘€ 에 걸치 μ–΄ κ°œν‘œ 상황 κ³Ό μ •λ‹Ή 별 μ˜μ„ 전망 을 속보 둜 μ „ν•˜ μ•˜ λ‹€ .""",
+ """두 λ‚˜λΌ 관계 λŠ” 쀑ꡭ 의 인ꢌ 문제 와 ν•΅ν™•μ‚° λ°©μ§€ 문제 , 톡상 문제 및 졜근 의 F 16 μ „νˆ¬κΈ° λŒ€ λŒ€λ§Œ 판맀 λ“± 을 놓 κ³  이미 μœ„ν—˜μ„  상 에 였 μ•„ 있 λŠ”λ° 클린턴 ν–‰μ •λΆ€ 의 λ“±μž₯ 으둜 μ–‘κ΅­ 관계 κ°€ λ”μš± 경색 되 γ„Ή 것 을 κ±±μ • ν•˜ λŠ” λΆ„μœ„κΈ° .""",
+ """μ„œμšΈλŒ€ 건좕곡학 κ³Ό λ₯Ό μ‘Έμ—… ν•˜ γ„΄ 이 씨 λŠ” ν•œκ΅­κ±΄μΆ•κ°€ν˜‘νšŒ""",
+ """λ‚˜ λŠ” λ‹€μ‹œ 순자 λ₯Ό 양동 μ—μ„œ λΉΌλ‚΄ κΈ° μœ„ν•˜ μ•„μ„œ μ°½μ‹  νŒ”λ™""",
+ """헀라신전 μ„œ 채화 되 γ„΄ μ§€ 보름 , 지ꡬ 의 λ°˜λ°”ν€΄ λ₯Ό 돌 μ•„ μ œμ£Όκ³΅ν•­ 에 첫발 을 λ‚΄λ””λ”” γ„΄ 이래 둜 열이틀""",
+ """λ‹€μŒ 은 홍콩 의 κΆŒμœ„μ§€ λͺ…보 와 일본 도쿄 ( 동경 ) μ‹ λ¬Έ 이 24일""",
+ """졜 μ˜μ‚¬ κ°€ 우리 외ꡐ관 이 λ©° κ·Έ μ‹ λ³€λ³΄ν˜Έ μ±…μž„ 이 주재ꡭ 이 γ„΄ λŸ¬μ‹œμ•„ 에 있 λ‹€λŠ” 점 μ—μ„œ λŸ¬μ‹œμ•„ λŠ” 이 κ°™ 은 우리 μ •λΆ€ μš”κ΅¬ 에 μ‘ν•˜ μ•„μ•Ό ν•˜ γ„Ή 의무 κ°€ 있 λ‹€ .""",
+ """판 에 λ°• 은 λ“― ν•˜ γ„΄ 깨끗 ν•˜ γ„΄ 글씨 둜 , 처음 단ꡰ λ‹˜ 이 λ‹ˆ 신라 , 백제 , 고ꡬ렀 이 λ‹ˆ 띄엄띄엄 μ–΄λ₯Έ λ“€ ν•œν…Œ μ„œ κ·€κ²° 둜 λ“€μ–΄μ˜€ 던 μ–˜κΈ° λ“€ 이 참말 둜 μ”Œ μ–΄ 있 μ—ˆ λ‹€ ."""
+ ]
+
+ selected_text = st.selectbox("Select an example", examples)
+ custom_input = st.text_input("Try it with your own Sentence!")
+
+ text_to_analyze = custom_input if custom_input else selected_text
+
+ st.subheader('Full example text')
+ HTML_WRAPPER = """<div class="scroll entities" style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem; white-space:pre-wrap">{}</div>"""
+ st.markdown(HTML_WRAPPER.format(text_to_analyze), unsafe_allow_html=True)
+
+ # Initialize Spark and create pipeline
+ spark = init_spark()
+ pipeline = create_pipeline(model)
+ output = fit_data(pipeline, text_to_analyze)
+
+ # Display matched sentence
+ st.subheader("Processed output:")
+
+ results = {
+     'Document': output[0]['document'][0].result,
+     'NER Chunk': [n.result for n in output[0]['ner_chunk']],
+     "NER Label": [n.metadata['entity'] for n in output[0]['ner_chunk']]
+ }
+
+ annotate(results)
+
+ with st.expander("View DataFrame"):
+     df = pd.DataFrame({'NER Chunk': results['NER Chunk'], 'NER Label': results['NER Label']})
+     df.index += 1
+     st.dataframe(df)
+
+
+
+
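For reference, below is a minimal standalone sketch of the same Spark NLP pipeline that Demo.py builds, without the Streamlit layer. It reuses the pretrained model names from the file above (wordseg_kaist_ud, glove_840B_300, ner_kmou_glove_840B_300d), which are Korean ("ko") models even though the page title in Demo.py mentions Urdu. The input sentence is a hypothetical placeholder, and running the sketch assumes a working Spark NLP installation that can download the pretrained models.

import sparknlp
from sparknlp.base import DocumentAssembler, LightPipeline
from sparknlp.annotator import (
    SentenceDetector,
    WordSegmenterModel,
    WordEmbeddingsModel,
    NerDLModel,
    NerConverter,
)
from pyspark.ml import Pipeline

spark = sparknlp.start()

# Same stages as create_pipeline() in Demo.py.
document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
sentence_detector = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
word_segmenter = WordSegmenterModel.pretrained("wordseg_kaist_ud", "ko") \
    .setInputCols(["sentence"]).setOutputCol("token")
embeddings = WordEmbeddingsModel.pretrained("glove_840B_300", "xx") \
    .setInputCols(["document", "token"]).setOutputCol("embeddings")
ner = NerDLModel.pretrained("ner_kmou_glove_840B_300d", "ko") \
    .setInputCols(["document", "token", "embeddings"]).setOutputCol("ner")
ner_converter = NerConverter().setInputCols(["document", "token", "ner"]).setOutputCol("ner_chunk")

pipeline = Pipeline(stages=[
    document_assembler, sentence_detector, word_segmenter, embeddings, ner, ner_converter
])

# Fit on an empty DataFrame (all stages are pretrained), then wrap the model in a
# LightPipeline for fast single-string annotation, as fit_data() does in Demo.py.
empty_df = spark.createDataFrame([[""]]).toDF("text")
light = LightPipeline(pipeline.fit(empty_df))

# Hypothetical placeholder input; in the demo this would be one of the Korean example sentences.
result = light.fullAnnotate("Some Korean text here")[0]
for chunk in result["ner_chunk"]:
    print(chunk.result, chunk.metadata["entity"])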