Update Demo.py
Demo.py CHANGED
@@ -2,15 +2,12 @@ import streamlit as st
 import sparknlp
 import pandas as pd
 import json
-import os
 
 from sparknlp.base import *
 from sparknlp.annotator import *
 from pyspark.ml import Pipeline
 from sparknlp.pretrained import PretrainedPipeline
 
-os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3.8'
-
 # Page configuration
 st.set_page_config(
     layout="wide",
@@ -40,13 +37,7 @@ st.markdown("""
 
 @st.cache_resource
 def init_spark():
-
-    spark = SparkSession.builder \
-        .appName("App") \
-        .config("spark.pyspark.python", "/usr/bin/python3.8") \
-        .config("spark.pyspark.driver.python", "/usr/bin/python3.8") \
-        .getOrCreate()
-    return spark
+    return sparknlp.start()
 
 @st.cache_resource
 def create_pipeline(model):
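Note: sparknlp.start() is the Spark NLP helper that creates (or reuses) a SparkSession with the Spark NLP jars attached, which is why the manual SparkSession.builder block and the hard-coded /usr/bin/python3.8 interpreter settings are no longer needed. A minimal standalone sketch of what the new init_spark body does:

import sparknlp

# Builds or reuses a SparkSession preconfigured for Spark NLP;
# no explicit SparkSession.builder settings are required.
spark = sparknlp.start()

print("Apache Spark version:", spark.version)
print("Spark NLP version:", sparknlp.version())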
@@ -75,11 +66,20 @@ def create_pipeline(model):
     pipeline = Pipeline(stages=[document_assembler, sentence_detector, table_assembler, tapas_wtq, tapas_sqa])
     return pipeline
 
+def fit_data(pipeline, data):
+    empty_df = spark.createDataFrame([['']]).toDF('text')
+    pipeline_model = pipeline.fit(empty_df)
+    model = LightPipeline(pipeline_model)
+    result = model.fullAnnotate(data)
+    return result
+
 def fit_data(pipeline, json_data, question):
     spark_df = spark.createDataFrame([[json_data, question]]).toDF("table_json", "questions")
     model = pipeline.fit(spark_df)
-
-
+    lightPipelineModel = LightPipeline(model)
+    result = lightPipelineModel.fullAnnotate(data)
+    st.write(result)
+    return result
 
 # Sidebar content
 model = st.sidebar.selectbox(
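Note: LightPipeline comes from sparknlp.base and runs a fitted pipeline on the driver for small inputs; fullAnnotate returns one dictionary per input, keyed by each annotator's output column. A minimal sketch of the pattern the new fit_data relies on, assuming the spark and pipeline objects from Demo.py; the output column names answers_wtq and answers_sqa are assumptions, since create_pipeline is not part of this diff:

from sparknlp.base import LightPipeline

# Hypothetical sample input in the header/rows JSON layout used later in Demo.py.
json_data = '{"header": ["name", "industry"], "rows": [["Alice", "Tech"], ["Bob", "Retail"]]}'
question = "How many people work in Tech?"

spark_df = spark.createDataFrame([[json_data, question]]).toDF("table_json", "questions")
pipeline_model = pipeline.fit(spark_df)

# Option 1: stay on Spark DataFrames.
pipeline_model.transform(spark_df) \
    .selectExpr("answers_wtq.result", "answers_sqa.result") \
    .show(truncate=False)

# Option 2: wrap the fitted model in a LightPipeline for driver-side annotation.
# The argument order is assumed to follow the pipeline's two input columns.
light_model = LightPipeline(pipeline_model)
annotations = light_model.fullAnnotate(json_data, question)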
@@ -90,19 +90,19 @@ model = st.sidebar.selectbox(
 
 # Set up the page layout
 title = 'TAPAS for Table-Based Question Answering with Spark NLP'
-sub_title = (
-
-
-
-
-)
+sub_title = ("""
+    TAPAS (Table Parsing Supervised via Pre-trained Language Models) enhances the BERT architecture to effectively process tabular data, allowing it to answer complex questions about tables without needing to convert them into text.<br>
+    <br>
+    <strong>table_qa_tapas_base_finetuned_wtq:</strong> This model excels at answering questions that require aggregating data across the entire table, such as calculating sums or averages.<br>
+    <strong>table_qa_tapas_base_finetuned_sqa:</strong> This model is designed for sequential question-answering tasks where the answer to each question may depend on the context provided by previous answers.
+""")
 
 st.markdown(f'<div class="main-title">{title}</div>', unsafe_allow_html=True)
 st.markdown(f'<div class="section"><p>{sub_title}</p></div>', unsafe_allow_html=True)
 
 # Reference notebook link in sidebar
 link = """
-<a href="https://
+    <a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/NER_HINDI_ENGLISH.ipynb">
     <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
 </a>
 """
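The two model names in the new sub_title map to Spark NLP's TapasForQuestionAnswering annotator. create_pipeline itself is not shown in this diff, but a pipeline using the stage names it references (document_assembler, sentence_detector, table_assembler, tapas_wtq, tapas_sqa) would look roughly like the sketch below; the input and output column names are assumptions:

from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

# Turns the raw table JSON and the question into two document columns.
document_assembler = MultiDocumentAssembler() \
    .setInputCols(["table_json", "questions"]) \
    .setOutputCols(["document_table", "document_questions"])

sentence_detector = SentenceDetector() \
    .setInputCols(["document_questions"]) \
    .setOutputCol("question")

# Parses the JSON document into a TABLE annotation.
table_assembler = TableAssembler() \
    .setInputCols(["document_table"]) \
    .setOutputCol("table")

# Aggregation-style questions (sums, counts, averages).
tapas_wtq = TapasForQuestionAnswering.pretrained("table_qa_tapas_base_finetuned_wtq", "en") \
    .setInputCols(["question", "table"]) \
    .setOutputCol("answers_wtq")

# Sequential question answering, where answers may depend on earlier ones.
tapas_sqa = TapasForQuestionAnswering.pretrained("table_qa_tapas_base_finetuned_sqa", "en") \
    .setInputCols(["question", "table"]) \
    .setOutputCol("answers_sqa")

pipeline = Pipeline(stages=[document_assembler, sentence_detector, table_assembler, tapas_wtq, tapas_sqa])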
@@ -149,7 +149,6 @@ queries = [
     "How many billionaires are in the conglomerate industry?"
 ]
 
-
 # Load the JSON data into a DataFrame and display it
 table_data = json.loads(json_data)
 df_table = pd.DataFrame(table_data["rows"], columns=table_data["header"])
@@ -180,5 +179,12 @@ output = fit_data(pipeline, table_json_str, text_to_analyze)
 
 # Display the output
 st.markdown("---")
-st.subheader("Processed
-
+st.subheader("Processed Output")
+
+# # Check if output is available
+# if output:
+#     results_wtq = output[0][0] if output[0][0] else "No results found."
+#     results_sqa = output[0][1] if output[0][1] else "No results found."
+#     st.markdown(f"**Answers from WTQ model:** {', '.join(results_wtq)}")
+#     st.markdown(f"**Answers from SQA model:** {', '.join(results_sqa)}")
+
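The commented-out block in the last hunk hints at how the answers are meant to be rendered. If output is a LightPipeline fullAnnotate result, one possible way to display both models' answers in Streamlit could look like this (again assuming the output columns answers_wtq and answers_sqa, and the st and output names from Demo.py):

# Hypothetical rendering of the fullAnnotate result; each entry is a dict of
# output column -> list of Annotation objects, whose .result holds the answer text.
if output:
    first = output[0]
    results_wtq = [ann.result for ann in first.get("answers_wtq", [])] or ["No results found."]
    results_sqa = [ann.result for ann in first.get("answers_sqa", [])] or ["No results found."]
    st.markdown(f"**Answers from WTQ model:** {', '.join(results_wtq)}")
    st.markdown(f"**Answers from SQA model:** {', '.join(results_sqa)}")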