abdullahmubeen10 commited on
Commit
c309169
·
verified ·
1 Parent(s): 8bc0ff0

Upload 5 files

Browse files
.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
# Streamlit theme configuration — light base with the Spark NLP brand blue
# as the primary accent color.
[theme]
base="light"
primaryColor="#29B4E8"
Demo.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st
import sparknlp
import pandas as pd
import json

from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline

# Page configuration: wide layout so the table editor has room to render.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="auto"
)

# CSS for styling: the blue centered title banner and the grey ".section"
# info boxes used by the markdown blocks further down the page.
st.markdown("""
<style>
.main-title {
font-size: 36px;
color: #4A90E2;
font-weight: bold;
text-align: center;
}
.section {
background-color: #f9f9f9;
padding: 10px;
border-radius: 10px;
margin-top: 10px;
}
.section p, .section ul {
color: #666666;
}
</style>
""", unsafe_allow_html=True)
37
+
38
@st.cache_resource
def init_spark():
    """Start (and cache) a single Spark NLP session for the Streamlit server."""
    return sparknlp.start()
41
+
42
@st.cache_resource
def create_pipeline(model):
    """Build the Spark NLP table-QA pipeline.

    NOTE(review): `model` is not referenced in the body — both TAPAS variants
    (WTQ and SQA) are always loaded and the pipeline emits both answer columns.
    The parameter only acts as the st.cache_resource cache key; confirm whether
    loading a single selected model was intended.
    """
    # Turn the raw JSON table string and the question string into documents.
    document_assembler = MultiDocumentAssembler() \
        .setInputCols("table_json", "questions") \
        .setOutputCols("document_table", "document_questions")

    # Split the questions document into individual question sentences.
    sentence_detector = SentenceDetector() \
        .setInputCols(["document_questions"]) \
        .setOutputCol("questions")

    # Parse the JSON table document into a table annotation.
    table_assembler = TableAssembler()\
        .setInputCols(["document_table"])\
        .setOutputCol("table")

    # TAPAS fine-tuned on WikiTableQuestions (aggregation-style answers).
    tapas_wtq = TapasForQuestionAnswering\
        .pretrained("table_qa_tapas_base_finetuned_wtq", "en")\
        .setInputCols(["questions", "table"])\
        .setOutputCol("answers_wtq")

    # TAPAS fine-tuned on SQA (sequential, literal-cell answers).
    tapas_sqa = TapasForQuestionAnswering\
        .pretrained("table_qa_tapas_base_finetuned_sqa", "en")\
        .setInputCols(["questions", "table"])\
        .setOutputCol("answers_sqa")

    pipeline = Pipeline(stages=[document_assembler, sentence_detector, table_assembler, tapas_wtq, tapas_sqa])
    return pipeline
68
+
69
def fit_data(pipeline, json_data, question):
    """Run the pipeline on one (table-JSON, question) pair and collect answers.

    Relies on the module-level `spark` session created later in the script
    (it is assigned before this function is first called).
    Returned rows: index 0 = answers_wtq.result, index 1 = answers_sqa.result.
    """
    spark_df = spark.createDataFrame([[json_data, question]]).toDF("table_json", "questions")
    model = pipeline.fit(spark_df)
    result = model.transform(spark_df)
    return result.select("answers_wtq.result", "answers_sqa.result").collect()
74
+
75
# Sidebar content: model choice (used below to pick which answer column to show).
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    ["table_qa_tapas_base_finetuned_wtq", "table_qa_tapas_base_finetuned_sqa"],
    help="For more info about the models visit: https://sparknlp.org/models"
)

# Set up the page layout
title = 'TAPAS for Table-Based Question Answering with Spark NLP'
sub_title = (
    'TAPAS (Table Parsing Supervised via Pre-trained Language Models) is a model that extends '
    'the BERT architecture to handle tabular data. Unlike traditional models that require flattening '
    'tables into text, TAPAS can directly interpret tables, making it a powerful tool for answering '
    'questions that involve tabular data.'
)

st.markdown(f'<div class="main-title">{title}</div>', unsafe_allow_html=True)
st.markdown(f'<div class="section"><p>{sub_title}</p></div>', unsafe_allow_html=True)

# Reference notebook link in sidebar
link = """
<a href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/15.1_Table_Question_Answering.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
</a>
"""
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)

# Default table shown in the editor, in the {"header": [...], "rows": [...]}
# JSON shape that TableAssembler expects.
json_data = '''
{
"header": ["name", "net_worth", "age", "nationality", "company", "industry"],
"rows": [
["Elon Musk", "$200,000,000,000", "52", "American", "Tesla, SpaceX", "Automotive, Aerospace"],
["Jeff Bezos", "$150,000,000,000", "60", "American", "Amazon", "E-commerce"],
["Bernard Arnault", "$210,000,000,000", "74", "French", "LVMH", "Luxury Goods"],
["Bill Gates", "$120,000,000,000", "68", "American", "Microsoft", "Technology"],
["Warren Buffett", "$110,000,000,000", "93", "American", "Berkshire Hathaway", "Conglomerate"],
["Larry Page", "$100,000,000,000", "51", "American", "Google", "Technology"],
["Mark Zuckerberg", "$85,000,000,000", "40", "American", "Meta", "Social Media"],
["Mukesh Ambani", "$80,000,000,000", "67", "Indian", "Reliance Industries", "Conglomerate"],
["Alice Walton", "$65,000,000,000", "74", "American", "Walmart", "Retail"],
["Francoise Bettencourt Meyers", "$70,000,000,000", "70", "French", "L'Oreal", "Cosmetics"],
["Amancio Ortega", "$75,000,000,000", "88", "Spanish", "Inditex (Zara)", "Retail"],
["Carlos Slim", "$55,000,000,000", "84", "Mexican", "America Movil", "Telecom"]
]
}
'''

# Preset example questions for the selectbox.
queries = [
    "Who has a higher net worth, Bernard Arnault or Jeff Bezos?",
    "List the top three individuals by net worth.",
    "Who is the richest person in the technology industry?",
    "Which company in the e-commerce industry has the highest net worth?",
    "Who is the oldest billionaire on the list?",
    "Which individual under the age of 60 has the highest net worth?",
    "Who is the wealthiest American, and which company do they own?",
    "Find all French billionaires and list their companies.",
    "How many women are on the list, and what are their total net worths?",
    "Who is the wealthiest non-American on the list?",
    "Find the person who is the youngest and has a net worth over $100 billion.",
    "Who owns companies in more than one industry, and what are those industries?",
    "What is the total net worth of all individuals over 70?",
    "How many billionaires are in the conglomerate industry?"
]

# Load the JSON data into a DataFrame and display it as an editable table.
table_data = json.loads(json_data)
df_table = pd.DataFrame(table_data["rows"], columns=table_data["header"])
df_table.index += 1

st.write("")
st.write("Context DataFrame (Click To Edit)")
edited_df = st.data_editor(df_table)

# Convert the (possibly edited) DataFrame back to the JSON format the pipeline needs.
table_json_data = {
    "header": edited_df.columns.tolist(),
    "rows": edited_df.values.tolist()
}
table_json_str = json.dumps(table_json_data)

# User input for questions: a custom question overrides the preset selection.
selected_text = st.selectbox("Question Query", queries)
custom_input = st.text_input("Try it with your own Question!")
text_to_analyze = custom_input if custom_input else selected_text

# Initialize Spark and create the pipeline (both cached across reruns).
spark = init_spark()
pipeline = create_pipeline(model)

# Run the pipeline with the selected query and the converted table data.
output = fit_data(pipeline, table_json_str, text_to_analyze)

# Display the output.
st.markdown("---")
st.subheader("Processed output:")
# Bug fix: the answer shown previously was always output[0][0] (the WTQ column),
# so the sidebar model selection had no effect. Pick the column matching the
# selected model: row index 0 = answers_wtq.result, index 1 = answers_sqa.result.
answers = output[0][0] if model == "table_qa_tapas_base_finetuned_wtq" else output[0][1]
st.write("**Answer:**", ', '.join(answers))
Dockerfile ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Download base image ubuntu 18.04
# NOTE(review): 18.04 is past standard support; a newer LTS may be preferable —
# confirm against the OpenJDK 8 / Spark version constraints first.
FROM ubuntu:18.04

# Set environment variables for the non-root runtime user.
ENV NB_USER jovyan
ENV NB_UID 1000
ENV HOME /home/${NB_USER}

# Install required packages: build tools, image/compression libs, the distro
# Python toolchain, and OpenJDK 8 (required by Spark/PySpark).
RUN apt-get update && apt-get install -y \
    tar \
    wget \
    bash \
    rsync \
    gcc \
    libfreetype6-dev \
    libhdf5-serial-dev \
    libpng-dev \
    libzmq3-dev \
    python3 \
    python3-dev \
    python3-pip \
    unzip \
    pkg-config \
    software-properties-common \
    graphviz \
    openjdk-8-jdk \
    ant \
    ca-certificates-java \
    && apt-get clean \
    && update-ca-certificates -f;

# Install Python 3.8 and pip
# NOTE(review): python3-pip is the distro (3.6-era) pip; it is invoked later
# via `python3.8 -m pip`, which mixes the 3.6 and 3.8 toolchains — verify the
# installed packages land under 3.8 as intended.
RUN add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y python3.8 python3-pip \
    && apt-get clean;

# Set up JAVA_HOME (PySpark requires a JVM at runtime).
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
RUN mkdir -p ${HOME} \
    && echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> ${HOME}/.bashrc \
    && chown -R ${NB_UID}:${NB_UID} ${HOME}

# Create a new user named "jovyan" with user ID 1000
RUN useradd -m -u ${NB_UID} ${NB_USER}

# Switch to the "jovyan" user so the app does not run as root.
USER ${NB_USER}

# Set home and path variables for the user (picks up pip --user installs).
ENV HOME=/home/${NB_USER} \
    PATH=/home/${NB_USER}/.local/bin:$PATH

# Set the working directory to the user's home directory
WORKDIR ${HOME}

# Upgrade pip and install Python dependencies
RUN python3.8 -m pip install --upgrade pip
COPY requirements.txt /tmp/requirements.txt
RUN python3.8 -m pip install -r /tmp/requirements.txt

# Copy the application code into the container at /home/jovyan
COPY --chown=${NB_USER}:${NB_USER} . ${HOME}

# Expose port for Streamlit (must match --server.port in the entrypoint).
EXPOSE 7860

# Define the entry point for the container
ENTRYPOINT ["streamlit", "run", "Demo.py", "--server.port=7860", "--server.address=0.0.0.0"]
pages/Workflow & Model Overview.py ADDED
@@ -0,0 +1,364 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import pandas as pd

# Custom CSS for better styling: blue title/heading colors and the grey
# ".section" boxes used by every markdown block on this documentation page.
st.markdown("""
<style>
.main-title {
font-size: 36px;
color: #4A90E2;
font-weight: bold;
text-align: center;
}
.sub-title {
font-size: 24px;
color: #4A90E2;
margin-top: 20px;
}
.section {
background-color: #f9f9f9;
padding: 15px;
border-radius: 10px;
margin-top: 20px;
}
.section p, .section ul {
color: #666666;
}
.link {
color: #4A90E2;
text-decoration: none;
}
h2 {
color: #4A90E2;
font-size: 28px;
font-weight: bold;
margin-top: 30px;
}
h3 {
color: #4A90E2;
font-size: 22px;
font-weight: bold;
margin-top: 20px;
}
h4 {
color: #4A90E2;
font-size: 18px;
font-weight: bold;
margin-top: 15px;
}
</style>
""", unsafe_allow_html=True)
51
+
52
+ # Main Title
53
+ st.markdown('<div class="main-title">Question Answering Over Tables with TAPAS and Spark NLP</div>', unsafe_allow_html=True)
54
+
55
+ # Overview Section
56
+ st.markdown("""
57
+ <div class="section">
58
+ <p>As data becomes increasingly complex, extracting meaningful insights from tabular data is more important than ever. TAPAS, a transformer-based model developed by Google, is designed specifically to handle question-answering over tables. By combining TAPAS with Spark NLP, we can leverage the power of distributed computing to process large datasets efficiently.</p>
59
+ <p>This guide will walk you through the process of setting up TAPAS in Spark NLP, implementing two specific models (<code>table_qa_tapas_base_finetuned_wtq</code> and <code>table_qa_tapas_base_finetuned_sqa</code>), and understanding their best use cases.</p>
60
+ </div>
61
+ """, unsafe_allow_html=True)
62
+
63
+ # Introduction to TAPAS and Spark NLP
64
+ st.markdown('<div class="sub-title">Introduction to TAPAS and Spark NLP</div>', unsafe_allow_html=True)
65
+
66
+ # What is TAPAS?
67
+ st.markdown("""
68
+ <div class="section">
69
+ <h3>What is TAPAS?</h3>
70
+ <p>TAPAS (Table Parsing Supervised via Pre-trained Language Models) is a model that extends the BERT architecture to handle tabular data. Unlike traditional models that require flattening tables into text, TAPAS can directly interpret tables, making it a powerful tool for answering questions that involve tabular data.</p>
71
+ </div>
72
+ """, unsafe_allow_html=True)
73
+
74
+ # Why Use TAPAS with Spark NLP?
75
+ st.markdown("""
76
+ <div class="section">
77
+ <h3>Why Use TAPAS with Spark NLP?</h3>
78
+ <p>Spark NLP, developed by John Snow Labs, is an open-source library that provides state-of-the-art natural language processing capabilities within a distributed computing framework. Integrating TAPAS with Spark NLP allows you to scale your question-answering tasks across large datasets, making it ideal for big data environments.</p>
79
+ </div>
80
+ """, unsafe_allow_html=True)
81
+
82
+ # Pipeline and Results
83
+ st.markdown('<div class="sub-title">Pipeline and Results</div>', unsafe_allow_html=True)
84
+
85
+ st.markdown("""
86
+ <div class="section">
87
+ <p>In this section, we’ll build a pipeline using Spark NLP to process a table and answer questions about the data it contains. We will utilize two different TAPAS models, each suited for different types of queries.</p>
88
+ </div>
89
+ """, unsafe_allow_html=True)
90
+
91
+ # Step 1: Creating the Data
92
+ st.markdown("""
93
+ <div class="section">
94
+ <h4>Step 1: Creating the Data</h4>
95
+ <p>We'll start by creating a Spark DataFrame that includes a table in JSON format and a set of questions.</p>
96
+ """, unsafe_allow_html=True)
97
+
98
+ st.code("""
99
+ json_data = '''
100
+ {
101
+ "header": ["name", "money", "age"],
102
+ "rows": [
103
+ ["Donald Trump", "$100,000,000", "75"],
104
+ ["Elon Musk", "$20,000,000,000,000", "55"]
105
+ ]
106
+ }
107
+ '''
108
+
109
+ queries = [
110
+ "Who earns less than 200,000,000?",
111
+ "Who earns 100,000,000?",
112
+ "How much money has Donald Trump?",
113
+ "How old are they?",
114
+ "How much money have they total?",
115
+ "Who earns more than Donald Trump?"
116
+ ]
117
+
118
+ data = spark.createDataFrame([[json_data, " ".join(queries)]])\\
119
+ .toDF("table_json", "questions")
120
+ """, language="python")
121
+
122
+ # Step 2: Assembling the Pipeline
123
+ st.markdown("""
124
+ <div class="section">
125
+ <h4>Step 2: Assembling the Pipeline</h4>
126
+ <p>We will now set up a Spark NLP pipeline that includes the necessary annotators for processing the table and questions.</p>
127
+ """, unsafe_allow_html=True)
128
+
129
+ st.code("""
130
+ from sparknlp.annotator import TapasForQuestionAnswering, SentenceDetector
131
+ from sparknlp.base import MultiDocumentAssembler, TableAssembler
132
+ from pyspark.ml import Pipeline
133
+ from pyspark.sql import functions as F
134
+
135
+ # Step 1: Transforms raw texts to `document` annotation
136
+ document_assembler = MultiDocumentAssembler() \\
137
+ .setInputCols("table_json", "questions") \\
138
+ .setOutputCols("document_table", "document_questions")
139
+
140
+ # Step 2: Getting the sentences
141
+ sentence_detector = SentenceDetector() \\
142
+ .setInputCols(["document_questions"]) \\
143
+ .setOutputCol("questions")
144
+
145
+ # Step 3: Get the tables
146
+ table_assembler = TableAssembler()\\
147
+ .setInputCols(["document_table"])\\
148
+ .setOutputCol("table")
149
+
150
+ # WTQ TAPAS model
151
+ tapas_wtq = TapasForQuestionAnswering\\
152
+ .pretrained("table_qa_tapas_base_finetuned_wtq", "en")\\
153
+ .setInputCols(["questions", "table"])\\
154
+ .setOutputCol("answers_wtq")
155
+
156
+ # SQA TAPAS model
157
+ tapas_sqa = TapasForQuestionAnswering\\
158
+ .pretrained("table_qa_tapas_base_finetuned_sqa", "en")\\
159
+ .setInputCols(["questions", "table"])\\
160
+ .setOutputCol("answers_sqa")
161
+
162
+ # Define pipeline
163
+ pipeline = Pipeline(stages=[
164
+ document_assembler,
165
+ sentence_detector,
166
+ table_assembler,
167
+ tapas_wtq,
168
+ tapas_sqa
169
+ ])
170
+
171
+ # Fit and transform data
172
+ model = pipeline.fit(data)
173
+ result = model.transform(data)
174
+ """, language="python")
175
+
176
+ # Step 3: Viewing the Results
177
+ st.markdown("""
178
+ <div class="section">
179
+ <h4>Step 3: Viewing the Results</h4>
180
+ <p>After processing, we can explore the results generated by each model:</p>
181
+ """, unsafe_allow_html=True)
182
+
183
+ st.code("""
184
+ # WTQ Model Results:
185
+ result.select(F.explode(result.answers_wtq)).show(truncate=False)
186
+ """, language="python")
187
+
188
+ st.text("""
189
+ +--------------------------------------+
190
+ |col |
191
+ +--------------------------------------+
192
+ |Donald Trump |
193
+ |Donald Trump |
194
+ |SUM($100,000,000) |
195
+ |AVERAGE(75, 55) |
196
+ |SUM($100,000,000, $20,000,000,000,000)|
197
+ |Elon Musk |
198
+ +--------------------------------------+
199
+ """)
200
+
201
+ st.code("""
202
+ # SQA Model Results:
203
+ result.select(F.explode(result.answers_sqa)).show(truncate=False)
204
+ """, language="python")
205
+
206
+ st.text("""
207
+ +---------------------------------+
208
+ |col |
209
+ +---------------------------------+
210
+ |Donald Trump |
211
+ |Donald Trump |
212
+ |$100,000,000 |
213
+ |75, 55 |
214
+ |$100,000,000, $20,000,000,000,000|
215
+ |Elon Musk |
216
+ +---------------------------------+
217
+ """)
218
+
219
+ # Comparing Results
220
+ st.markdown("""
221
+ <div class="section">
222
+ <h4>Comparing Results</h4>
223
+ <p>To better understand the differences, we can compare the results from both models side by side:</p>
224
+ """, unsafe_allow_html=True)
225
+
226
+ st.code("""
227
+ result.select(F.explode(F.arrays_zip(result.questions.result,
228
+ result.answers_sqa.result,
229
+ result.answers_wtq.result)).alias("cols"))\\
230
+ .select(F.expr("cols['0']").alias("question"),
231
+ F.expr("cols['1']").alias("answer_sqa"),
232
+ F.expr("cols['2']").alias("answer_wtq")).show(truncate=False)
233
+ """, language="python")
234
+
235
+ st.text("""
236
+ +---------------------------------+---------------------------------+--------------------------------------+
237
+ |question |answer_sqa |answer_wtq |
238
+ +---------------------------------+---------------------------------+--------------------------------------+
239
+ |Who earns less than 200,000,000? |Donald Trump |Donald Trump |
240
+ |Who earns 100,000,000? |Donald Trump |Donald Trump |
241
+ |How much money has Donald Trump? |$100,000,000 |SUM($100,000,000) |
242
+ |How old are they? |75, 55 |AVERAGE(75, 55) |
243
+ |How much money have they total? |$100,000,000, $20,000,000,000,000|SUM($100,000,000, $20,000,000,000,000)|
244
+ |Who earns more than Donald Trump?|Elon Musk |Elon Musk |
245
+ +---------------------------------+---------------------------------+--------------------------------------+
246
+ """)
247
+
248
+ # One-Liner Alternative
249
+ st.markdown("""
250
+ <div class="section">
251
+ <h4>One-Liner Alternative</h4>
252
+ <p>For those who prefer a simpler approach, John Snow Labs offers a one-liner API to quickly get answers using TAPAS models.</p>
253
+ """, unsafe_allow_html=True)
254
+
255
+ st.code("""
256
+ # Download the johnsnowlabs library
257
+ pip install johnsnowlabs
258
+ """, language="bash")
259
+
260
+ st.code("""
261
+ import pandas as pd
262
+ from johnsnowlabs import nlp
263
+
264
+ # Create the context DataFrame
265
+ context_df = pd.DataFrame({
266
+ 'name': ['Donald Trump', 'Elon Musk'],
267
+ 'money': ['$100,000,000', '$20,000,000,000,000'],
268
+ 'age': ['75', '55']
269
+ })
270
+
271
+ # Define the questions
272
+ questions = [
273
+ "Who earns less than 200,000,000?",
274
+ "Who earns 100,000,000?",
275
+ "How much money has Donald Trump?",
276
+ "How old are they?",
277
+ "How much money have they total?",
278
+ "Who earns more than Donald Trump?"
279
+ ]
280
+
281
+ # Combine context and questions into a tuple
282
+ tapas_data = (context_df, questions)
283
+
284
+ # Use the one-liner API with the WTQ model
285
+ answers_wtq = nlp.load('en.answer_question.tapas.wtq.large_finetuned').predict(tapas_data)
286
+ answers_wtq[['sentence', 'tapas_qa_UNIQUE_answer']]
287
+ """, language="python")
288
+
289
+ # Define the data as a list of dictionaries
290
+ data = {
291
+ "sentence": [
292
+ "Who earns less than 200,000,000?",
293
+ "Who earns 100,000,000?",
294
+ "How much money has Donald Trump?",
295
+ "How old are they?",
296
+ "How much money have they total? Who earns more..."
297
+ ],
298
+ "tapas_qa_UNIQUE_answer": [
299
+ "Donald Trump",
300
+ "Donald Trump",
301
+ "SUM($100,000,000)",
302
+ "SUM(55)",
303
+ "SUM($20,000,000,000,000)"
304
+ ]
305
+ }
306
+ st.dataframe(pd.DataFrame(data))
307
+
308
+ # Model Information and Use Cases
309
+ st.markdown("""
310
+ <div class="section">
311
+ <h4>Model Information and Use Cases</h4>
312
+ <p>Understanding the strengths of each TAPAS model can help you choose the right tool for your task.</p>
313
+ <ul>
314
+ <li><b>table_qa_tapas_base_finetuned_wtq</b></li>
315
+ <ul>
316
+ <li>Best for: answering questions involving table-wide aggregation (e.g., sums, averages).</li>
317
+ </ul>
318
+ <li><b>table_qa_tapas_base_finetuned_sqa</b></li>
319
+ <ul>
320
+ <li>Best for: answering questions in a sequential question-answering context, where the current question depends on previous answers.</li>
321
+ </ul>
322
+ </ul>
323
+ </div>
324
+ """, unsafe_allow_html=True)
325
+
326
+ # Conclusion
327
+ st.markdown("""
328
+ <div class="section">
329
+ <h4>Conclusion</h4>
330
+ <p>TAPAS, integrated with Spark NLP, provides a powerful solution for question-answering over tables, capable of handling both complex aggregation queries and straightforward Q&A tasks. Whether you're working with large datasets or simple tables, TAPAS offers flexibility and scalability. The <code>table_qa_tapas_base_finetuned_wtq</code> model excels in aggregation tasks, while <code>table_qa_tapas_base_finetuned_sqa</code> is best for direct, sequential question-answering.</p>
331
+ <p>By following this guide, you can efficiently implement TAPAS in your own projects, leveraging Spark NLP's powerful processing capabilities to extract insights from your data.</p>
332
+ </div>
333
+ """, unsafe_allow_html=True)
334
+
335
+ # References
336
+ st.markdown("""
337
+ <div class="section">
338
+ <h4>References</h4>
339
+ <ul>
340
+ <li>Documentation : <a class="link" href="https://nlp.johnsnowlabs.com/docs/en/annotators#multidocumentassembler" target="_blank" rel="noopener">MultiDocumentAssembler</a>, <a class="link" href="https://nlp.johnsnowlabs.com/docs/en/annotators#TapasForQuestionAnswering">TapasForQuestionAnswering</a></li>
341
+ <li>Python Doc : <a class="link" href="https://nlp.johnsnowlabs.com/api/python/reference/autosummary/sparknlp/base/multi_document_assembler/index.html#sparknlp.base.multi_document_assembler.MultiDocumentAssembler.setIdCol" target="_blank" rel="noopener">MultiDocumentAssembler</a>, <a class="link" href="https://nlp.johnsnowlabs.com/api/python/reference/autosummary/sparknlp/annotator/classifier_dl/tapas_for_question_answering/index.html" target="_blank" rel="noopener">TapasForQuestionAnswering</a></li>
342
+ <li>Scala Doc : <a class="link" href="https://nlp.johnsnowlabs.com/api/com/johnsnowlabs/nlp/MultiDocumentAssembler.html" target="_blank" rel="noopener">MultiDocumentAssembler</a>, <a class="link" href="https://nlp.johnsnowlabs.com/api/com/johnsnowlabs/nlp/annotators/classifier/dl/TapasForQuestionAnswering.html">TapasForQuestionAnswering</a></li>
343
+ <li>Models Used : <a class="link" href="https://sparknlp.org/2022/09/30/table_qa_tapas_base_finetuned_wtq_en.html" target="_blank" rel="noopener">table_qa_tapas_base_finetuned_wtq</a>, <a class="link" href="https://sparknlp.org/2022/09/30/table_qa_tapas_base_finetuned_sqa_en.html">table_qa_tapas_base_finetuned_sqa</a></li>
344
+ <li>For extended examples of usage, see the notebooks for <a class="link" href="https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-assembler/Loading_Multiple_Documents.ipynb" target="_blank" rel="noopener">MultiDocumentAssembler</a>, <a class="link" href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/15.1_Table_Question_Answering.ipynb" target="_blank" rel="noopener">TapasForQuestionAnswering</a>.</li>
345
+ <li><a href="https://arxiv.org/abs/2004.02349" class="link" target="_blank">TAPAS: Weakly Supervised Table Parsing via Pre-trained Language Models</a></li>
346
+ <li><a href="https://nlp.johnsnowlabs.com/" class="link" target="_blank">Spark NLP Documentation</a></li>
347
+ <li><a href="https://nlp.johnsnowlabs.com/models" class="link" target="_blank">John Snow Labs Models Hub</a></li>
348
+ </ul>
349
+ </div>
350
+ """, unsafe_allow_html=True)
351
+
352
+ # Community & Support
353
+ st.markdown('<div class="sub-title">Community & Support</div>', unsafe_allow_html=True)
354
+ st.markdown("""
355
+ <div class="section">
356
+ <ul>
357
+ <li><a class="link" href="https://sparknlp.org/" target="_blank">Official Website</a>: Documentation and examples</li>
358
+ <li><a class="link" href="https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q" target="_blank">Slack</a>: Live discussion with the community and team</li>
359
+ <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank">GitHub</a>: Bug reports, feature requests, and contributions</li>
360
+ <li><a class="link" href="https://medium.com/spark-nlp" target="_blank">Medium</a>: Spark NLP articles</li>
361
+ <li><a class="link" href="https://www.youtube.com/channel/UCmFOjlpYEhxf_wJUDuz6xxQ/videos" target="_blank">YouTube</a>: Video tutorials</li>
362
+ </ul>
363
+ </div>
364
+ """, unsafe_allow_html=True)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ st-annotated-text
3
+ pandas
4
+ numpy
5
+ spark-nlp
6
+ pyspark