abdullahmubeen10 committed on
Commit
ca1575d
·
verified ·
1 Parent(s): c309169

Update Demo.py

Browse files
Files changed (1) hide show
  1. Demo.py +180 -175
Demo.py CHANGED
@@ -1,175 +1,180 @@
1
- import streamlit as st
2
- import sparknlp
3
- import pandas as pd
4
- import json
5
-
6
- from sparknlp.base import *
7
- from sparknlp.annotator import *
8
- from pyspark.ml import Pipeline
9
- from sparknlp.pretrained import PretrainedPipeline
10
-
11
- # Page configuration
12
st.set_page_config(layout="wide", initial_sidebar_state="auto")

# Stylesheet for the title banner (.main-title) and the description card
# (.section). Held in a named constant so the injection call stays short.
_PAGE_CSS = """
<style>
.main-title {
font-size: 36px;
color: #4A90E2;
font-weight: bold;
text-align: center;
}
.section {
background-color: #f9f9f9;
padding: 10px;
border-radius: 10px;
margin-top: 10px;
}
.section p, .section ul {
color: #666666;
}
</style>
"""

# The payload is raw HTML, so it is passed with unsafe_allow_html=True.
st.markdown(_PAGE_CSS, unsafe_allow_html=True)
37
-
38
@st.cache_resource
def init_spark():
    """Start Spark NLP and return the resulting Spark session.

    Decorated with ``st.cache_resource``; the session is whatever
    ``sparknlp.start()`` returns with its default arguments.
    """
    session = sparknlp.start()
    return session
41
-
42
@st.cache_resource
def create_pipeline(model):
    """Assemble the unfitted TAPAS table question-answering pipeline.

    Args:
        model: Name of the model picked in the sidebar. The body never reads
            it: both TAPAS checkpoints are always loaded. Presumably it only
            distinguishes ``st.cache_resource`` entries.

    Returns:
        A ``pyspark.ml.Pipeline`` with five stages that reads the
        ``table_json`` and ``questions`` columns and writes ``answers_wtq``
        and ``answers_sqa``.
    """

    def tapas_stage(checkpoint, output_col):
        # Both TAPAS annotators consume the same question/table columns and
        # differ only in the pretrained checkpoint and the output column.
        annotator = TapasForQuestionAnswering.pretrained(checkpoint, "en")
        annotator = annotator.setInputCols(["questions", "table"])
        return annotator.setOutputCol(output_col)

    # Turn the two raw string columns into document annotations.
    to_documents = MultiDocumentAssembler()
    to_documents = to_documents.setInputCols("table_json", "questions")
    to_documents = to_documents.setOutputCols("document_table", "document_questions")

    # Split the question document into sentences; the output column reuses
    # the name "questions".
    split_questions = SentenceDetector()
    split_questions = split_questions.setInputCols(["document_questions"])
    split_questions = split_questions.setOutputCol("questions")

    # Convert the table document into the "table" column the TAPAS stages read.
    to_table = TableAssembler()
    to_table = to_table.setInputCols(["document_table"])
    to_table = to_table.setOutputCol("table")

    wtq_stage = tapas_stage("table_qa_tapas_base_finetuned_wtq", "answers_wtq")
    sqa_stage = tapas_stage("table_qa_tapas_base_finetuned_sqa", "answers_sqa")

    stages = [to_documents, split_questions, to_table, wtq_stage, sqa_stage]
    return Pipeline(stages=stages)
68
-
69
def fit_data(pipeline, json_data, question):
    """Run one table/question pair through the pipeline and collect answers.

    Relies on the module-level ``spark`` session, which must be assigned
    before this function is called.

    Args:
        pipeline: Unfitted pipeline, as built by ``create_pipeline``.
        json_data: The table serialized as a JSON string.
        question: The question text to ask about the table.

    Returns:
        The collected rows. Each row holds ``answers_wtq.result`` first and
        ``answers_sqa.result`` second.
    """
    single_row = [[json_data, question]]
    frame = spark.createDataFrame(single_row).toDF("table_json", "questions")

    fitted = pipeline.fit(frame)
    answered = fitted.transform(frame)

    selected = answered.select("answers_wtq.result", "answers_sqa.result")
    return selected.collect()
74
-
75
- # Sidebar content
76
# Sidebar picker for the pretrained TAPAS checkpoint.
_MODEL_CHOICES = [
    "table_qa_tapas_base_finetuned_wtq",
    "table_qa_tapas_base_finetuned_sqa",
]
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    _MODEL_CHOICES,
    help="For more info about the models visit: https://sparknlp.org/models",
)

# Page heading and the short description rendered underneath it.
title = 'TAPAS for Table-Based Question Answering with Spark NLP'
sub_title = " ".join([
    'TAPAS (Table Parsing Supervised via Pre-trained Language Models) is a model that extends',
    'the BERT architecture to handle tabular data. Unlike traditional models that require flattening',
    'tables into text, TAPAS can directly interpret tables, making it a powerful tool for answering',
    'questions that involve tabular data.',
])

# Both snippets use the CSS classes defined by the page stylesheet.
title_html = f'<div class="main-title">{title}</div>'
description_html = f'<div class="section"><p>{sub_title}</p></div>'
st.markdown(title_html, unsafe_allow_html=True)
st.markdown(description_html, unsafe_allow_html=True)

# Colab badge linking to the reference notebook, shown in the sidebar.
link = """
<a href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/15.1_Table_Question_Answering.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
</a>
"""
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)
102
-
103
- # Define the JSON data for the table
104
- # New JSON data
105
# Default table: a "header" list of column names plus "rows" of string cells.
json_data = '''
{
"header": ["name", "net_worth", "age", "nationality", "company", "industry"],
"rows": [
["Elon Musk", "$200,000,000,000", "52", "American", "Tesla, SpaceX", "Automotive, Aerospace"],
["Jeff Bezos", "$150,000,000,000", "60", "American", "Amazon", "E-commerce"],
["Bernard Arnault", "$210,000,000,000", "74", "French", "LVMH", "Luxury Goods"],
["Bill Gates", "$120,000,000,000", "68", "American", "Microsoft", "Technology"],
["Warren Buffett", "$110,000,000,000", "93", "American", "Berkshire Hathaway", "Conglomerate"],
["Larry Page", "$100,000,000,000", "51", "American", "Google", "Technology"],
["Mark Zuckerberg", "$85,000,000,000", "40", "American", "Meta", "Social Media"],
["Mukesh Ambani", "$80,000,000,000", "67", "Indian", "Reliance Industries", "Conglomerate"],
["Alice Walton", "$65,000,000,000", "74", "American", "Walmart", "Retail"],
["Francoise Bettencourt Meyers", "$70,000,000,000", "70", "French", "L'Oreal", "Cosmetics"],
["Amancio Ortega", "$75,000,000,000", "88", "Spanish", "Inditex (Zara)", "Retail"],
["Carlos Slim", "$55,000,000,000", "84", "Mexican", "America Movil", "Telecom"]
]
}
'''

# Sample questions offered in the "Question Query" dropdown.
queries = [
    "Who has a higher net worth, Bernard Arnault or Jeff Bezos?",
    "List the top three individuals by net worth.",
    "Who is the richest person in the technology industry?",
    "Which company in the e-commerce industry has the highest net worth?",
    "Who is the oldest billionaire on the list?",
    "Which individual under the age of 60 has the highest net worth?",
    "Who is the wealthiest American, and which company do they own?",
    "Find all French billionaires and list their companies.",
    "How many women are on the list, and what are their total net worths?",
    "Who is the wealthiest non-American on the list?",
    "Find the person who is the youngest and has a net worth over $100 billion.",
    "Who owns companies in more than one industry, and what are those industries?",
    "What is the total net worth of all individuals over 70?",
    "How many billionaires are in the conglomerate industry?",
]

# Parse the literal and show it as an editable grid with 1-based row labels.
table_data = json.loads(json_data)
df_table = pd.DataFrame(data=table_data["rows"], columns=table_data["header"])
df_table.index = df_table.index + 1

st.write("")
st.write("Context DataFrame (Click To Edit)")
edited_df = st.data_editor(df_table)

# Serialize the (possibly edited) grid back into the header/rows JSON layout.
edited_header = edited_df.columns.tolist()
edited_rows = edited_df.values.tolist()
table_json_data = {"header": edited_header, "rows": edited_rows}
table_json_str = json.dumps(table_json_data)
159
-
160
- # User input for questions
161
# A typed question takes precedence over the dropdown choice. Whitespace-only
# input is treated as "nothing typed" so it cannot override the sample query.
selected_text = st.selectbox("Question Query", queries)
custom_input = st.text_input("Try it with your own Question!")
text_to_analyze = custom_input.strip() or selected_text

# Initialize Spark and create the pipeline
spark = init_spark()
pipeline = create_pipeline(model)

# Run the pipeline with the selected query and the converted table data
output = fit_data(pipeline, table_json_str, text_to_analyze)

# fit_data selects "answers_wtq.result" first and "answers_sqa.result" second,
# so the model chosen in the sidebar maps to a position in the collected row.
# Previously position 0 (WTQ) was always shown, whatever the sidebar selection.
_ANSWER_POSITION = {
    "table_qa_tapas_base_finetuned_wtq": 0,
    "table_qa_tapas_base_finetuned_sqa": 1,
}
selected_answers = output[0][_ANSWER_POSITION.get(model, 0)]

# Display the output
st.markdown("---")
st.subheader("Processed output:")
st.write("**Answer:**", ', '.join(selected_answers))
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import sparknlp
3
+ import pandas as pd
4
+ import json
5
+
6
+ from sparknlp.base import *
7
+ from sparknlp.annotator import *
8
+ from pyspark.ml import Pipeline
9
+ from sparknlp.pretrained import PretrainedPipeline
10
+
11
+ # Page configuration
12
st.set_page_config(layout="wide", initial_sidebar_state="auto")

# Stylesheet for the title banner (.main-title) and the description card
# (.section). Held in a named constant so the injection call stays short.
_PAGE_CSS = """
<style>
.main-title {
font-size: 36px;
color: #4A90E2;
font-weight: bold;
text-align: center;
}
.section {
background-color: #f9f9f9;
padding: 10px;
border-radius: 10px;
margin-top: 10px;
}
.section p, .section ul {
color: #666666;
}
</style>
"""

# The payload is raw HTML, so it is passed with unsafe_allow_html=True.
st.markdown(_PAGE_CSS, unsafe_allow_html=True)
37
+
38
@st.cache_resource
def init_spark():
    """Return the Spark session used by the demo.

    The session comes from ``SparkSession.builder.getOrCreate()`` with
    ``spark.pyspark.python`` and ``spark.pyspark.driver.python`` both set to
    ``/usr/bin/python3.8``. Decorated with ``st.cache_resource``.
    """
    from pyspark.sql import SparkSession

    # NOTE(review): the interpreter path is hard-coded; presumably it matches
    # the deployment image. Confirm before running elsewhere.
    python_bin = "/usr/bin/python3.8"

    builder = SparkSession.builder
    builder = builder.config("spark.pyspark.python", python_bin)
    builder = builder.config("spark.pyspark.driver.python", python_bin)
    return builder.getOrCreate()
46
+
47
@st.cache_resource
def create_pipeline(model):
    """Assemble the unfitted TAPAS table question-answering pipeline.

    Args:
        model: Name of the model picked in the sidebar. The body never reads
            it: both TAPAS checkpoints are always loaded. Presumably it only
            distinguishes ``st.cache_resource`` entries.

    Returns:
        A ``pyspark.ml.Pipeline`` with five stages that reads the
        ``table_json`` and ``questions`` columns and writes ``answers_wtq``
        and ``answers_sqa``.
    """

    def tapas_stage(checkpoint, output_col):
        # Both TAPAS annotators consume the same question/table columns and
        # differ only in the pretrained checkpoint and the output column.
        annotator = TapasForQuestionAnswering.pretrained(checkpoint, "en")
        annotator = annotator.setInputCols(["questions", "table"])
        return annotator.setOutputCol(output_col)

    # Turn the two raw string columns into document annotations.
    to_documents = MultiDocumentAssembler()
    to_documents = to_documents.setInputCols("table_json", "questions")
    to_documents = to_documents.setOutputCols("document_table", "document_questions")

    # Split the question document into sentences; the output column reuses
    # the name "questions".
    split_questions = SentenceDetector()
    split_questions = split_questions.setInputCols(["document_questions"])
    split_questions = split_questions.setOutputCol("questions")

    # Convert the table document into the "table" column the TAPAS stages read.
    to_table = TableAssembler()
    to_table = to_table.setInputCols(["document_table"])
    to_table = to_table.setOutputCol("table")

    wtq_stage = tapas_stage("table_qa_tapas_base_finetuned_wtq", "answers_wtq")
    sqa_stage = tapas_stage("table_qa_tapas_base_finetuned_sqa", "answers_sqa")

    stages = [to_documents, split_questions, to_table, wtq_stage, sqa_stage]
    return Pipeline(stages=stages)
73
+
74
def fit_data(pipeline, json_data, question):
    """Run one table/question pair through the pipeline and collect answers.

    Relies on the module-level ``spark`` session, which must be assigned
    before this function is called.

    Args:
        pipeline: Unfitted pipeline, as built by ``create_pipeline``.
        json_data: The table serialized as a JSON string.
        question: The question text to ask about the table.

    Returns:
        The collected rows. Each row holds ``answers_wtq.result`` first and
        ``answers_sqa.result`` second.
    """
    single_row = [[json_data, question]]
    frame = spark.createDataFrame(single_row).toDF("table_json", "questions")

    fitted = pipeline.fit(frame)
    answered = fitted.transform(frame)

    selected = answered.select("answers_wtq.result", "answers_sqa.result")
    return selected.collect()
79
+
80
+ # Sidebar content
81
# Sidebar picker for the pretrained TAPAS checkpoint.
_MODEL_CHOICES = [
    "table_qa_tapas_base_finetuned_wtq",
    "table_qa_tapas_base_finetuned_sqa",
]
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    _MODEL_CHOICES,
    help="For more info about the models visit: https://sparknlp.org/models",
)

# Page heading and the short description rendered underneath it.
title = 'TAPAS for Table-Based Question Answering with Spark NLP'
sub_title = " ".join([
    'TAPAS (Table Parsing Supervised via Pre-trained Language Models) is a model that extends',
    'the BERT architecture to handle tabular data. Unlike traditional models that require flattening',
    'tables into text, TAPAS can directly interpret tables, making it a powerful tool for answering',
    'questions that involve tabular data.',
])

# Both snippets use the CSS classes defined by the page stylesheet.
title_html = f'<div class="main-title">{title}</div>'
description_html = f'<div class="section"><p>{sub_title}</p></div>'
st.markdown(title_html, unsafe_allow_html=True)
st.markdown(description_html, unsafe_allow_html=True)

# Colab badge linking to the reference notebook, shown in the sidebar.
link = """
<a href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/15.1_Table_Question_Answering.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
</a>
"""
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)
107
+
108
+ # Define the JSON data for the table
109
+ # New JSON data
110
# Default table: a "header" list of column names plus "rows" of string cells.
json_data = '''
{
"header": ["name", "net_worth", "age", "nationality", "company", "industry"],
"rows": [
["Elon Musk", "$200,000,000,000", "52", "American", "Tesla, SpaceX", "Automotive, Aerospace"],
["Jeff Bezos", "$150,000,000,000", "60", "American", "Amazon", "E-commerce"],
["Bernard Arnault", "$210,000,000,000", "74", "French", "LVMH", "Luxury Goods"],
["Bill Gates", "$120,000,000,000", "68", "American", "Microsoft", "Technology"],
["Warren Buffett", "$110,000,000,000", "93", "American", "Berkshire Hathaway", "Conglomerate"],
["Larry Page", "$100,000,000,000", "51", "American", "Google", "Technology"],
["Mark Zuckerberg", "$85,000,000,000", "40", "American", "Meta", "Social Media"],
["Mukesh Ambani", "$80,000,000,000", "67", "Indian", "Reliance Industries", "Conglomerate"],
["Alice Walton", "$65,000,000,000", "74", "American", "Walmart", "Retail"],
["Francoise Bettencourt Meyers", "$70,000,000,000", "70", "French", "L'Oreal", "Cosmetics"],
["Amancio Ortega", "$75,000,000,000", "88", "Spanish", "Inditex (Zara)", "Retail"],
["Carlos Slim", "$55,000,000,000", "84", "Mexican", "America Movil", "Telecom"]
]
}
'''

# Sample questions offered in the "Question Query" dropdown.
queries = [
    "Who has a higher net worth, Bernard Arnault or Jeff Bezos?",
    "List the top three individuals by net worth.",
    "Who is the richest person in the technology industry?",
    "Which company in the e-commerce industry has the highest net worth?",
    "Who is the oldest billionaire on the list?",
    "Which individual under the age of 60 has the highest net worth?",
    "Who is the wealthiest American, and which company do they own?",
    "Find all French billionaires and list their companies.",
    "How many women are on the list, and what are their total net worths?",
    "Who is the wealthiest non-American on the list?",
    "Find the person who is the youngest and has a net worth over $100 billion.",
    "Who owns companies in more than one industry, and what are those industries?",
    "What is the total net worth of all individuals over 70?",
    "How many billionaires are in the conglomerate industry?",
]

# Parse the literal and show it as an editable grid with 1-based row labels.
table_data = json.loads(json_data)
df_table = pd.DataFrame(data=table_data["rows"], columns=table_data["header"])
df_table.index = df_table.index + 1

st.write("")
st.write("Context DataFrame (Click To Edit)")
edited_df = st.data_editor(df_table)

# Serialize the (possibly edited) grid back into the header/rows JSON layout.
edited_header = edited_df.columns.tolist()
edited_rows = edited_df.values.tolist()
table_json_data = {"header": edited_header, "rows": edited_rows}
table_json_str = json.dumps(table_json_data)
164
+
165
+ # User input for questions
166
# A typed question takes precedence over the dropdown choice. Whitespace-only
# input is treated as "nothing typed" so it cannot override the sample query.
selected_text = st.selectbox("Question Query", queries)
custom_input = st.text_input("Try it with your own Question!")
text_to_analyze = custom_input.strip() or selected_text

# Initialize Spark and create the pipeline
spark = init_spark()
pipeline = create_pipeline(model)

# Run the pipeline with the selected query and the converted table data
output = fit_data(pipeline, table_json_str, text_to_analyze)

# fit_data selects "answers_wtq.result" first and "answers_sqa.result" second,
# so the model chosen in the sidebar maps to a position in the collected row.
# Previously position 0 (WTQ) was always shown, whatever the sidebar selection.
_ANSWER_POSITION = {
    "table_qa_tapas_base_finetuned_wtq": 0,
    "table_qa_tapas_base_finetuned_sqa": 1,
}
selected_answers = output[0][_ANSWER_POSITION.get(model, 0)]

# Display the output
st.markdown("---")
st.subheader("Processed output:")
st.write("**Answer:**", ', '.join(selected_answers))