Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,13 +1,13 @@
|
|
1 |
import os
|
2 |
-
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
3 |
-
|
4 |
import streamlit as st
|
5 |
from st_aggrid import AgGrid
|
6 |
-
import pandas as pd
|
7 |
from transformers import pipeline, T5ForConditionalGeneration, T5Tokenizer
|
8 |
|
|
|
9 |
st.set_page_config(layout="wide")
|
10 |
|
|
|
11 |
style = '''
|
12 |
<style>
|
13 |
body {background-color: #F5F5F5; color: #000000;}
|
@@ -23,45 +23,22 @@ style = '''
|
|
23 |
'''
|
24 |
st.markdown(style, unsafe_allow_html=True)
|
25 |
|
26 |
-
st.markdown('<p style="font-family:sans-serif;font-size: 1.9rem;"> HertogAI
|
27 |
st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'>Pre-trained TAPAS model runs on max 64 rows and 32 columns data. Make sure the file data doesn't exceed these dimensions.</p>", unsafe_allow_html=True)
|
28 |
|
29 |
-
# Initialize TAPAS
|
30 |
tqa = pipeline(task="table-question-answering",
|
31 |
model="google/tapas-large-finetuned-wtq",
|
32 |
device="cpu")
|
33 |
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
# Function to generate natural language from TAPAS output
|
39 |
-
def generate_nlp_from_tapas(tapas_output, df):
    """Rephrase a TAPAS result as a natural-language sentence using T5.

    Args:
        tapas_output: Mapping with an 'answer' entry and a 'coordinates'
            entry; each coordinate is unpacked as a (row, column) pair.
        df: Table the coordinates refer to; cells are read positionally
            via ``df.iloc[row, col]``.

    Returns:
        The decoded text generated by the model, or a string starting with
        "Error generating response: " when any step fails.
    """
    # NOTE(review): `tokenizer` and `model` are module-level names that are
    # not defined inside this function -- confirm they are initialised
    # before the first call.
    try:
        # Collect the answer and the cell values it points at.
        answer = tapas_output['answer']
        coordinates = tapas_output['coordinates']
        answer_data = [df.iloc[row, col] for row, col in coordinates]

        # Format the prompt for the text-generation model.
        prompt = f"Answer: {answer}. Data Location: Rows {coordinates}, Values: {answer_data}. Please summarize this information in a natural language sentence."

        # Tokenize the prompt and generate a response with beam search.
        inputs = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=512)
        outputs = model.generate(inputs, max_length=100, num_beams=5, early_stopping=True)

        # Decode and return the first generated sequence.
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        # Best-effort by design: failures are reported to the caller as text
        # rather than raised.
        return f"Error generating response: {str(e)}"
|
61 |
-
|
62 |
|
|
|
63 |
file_name = st.sidebar.file_uploader("Upload file:", type=['csv', 'xlsx'])
|
64 |
|
|
|
65 |
if file_name is None:
|
66 |
st.markdown('<p class="font">Please upload an excel or csv file </p>', unsafe_allow_html=True)
|
67 |
else:
|
@@ -88,6 +65,7 @@ else:
|
|
88 |
df_numeric = df.copy()
|
89 |
df = df.astype(str)
|
90 |
|
|
|
91 |
grid_response = AgGrid(
|
92 |
df.head(5),
|
93 |
columns_auto_size_mode='FIT_CONTENTS',
|
@@ -99,24 +77,50 @@ else:
|
|
99 |
except Exception as e:
|
100 |
st.error(f"Error reading file: {str(e)}")
|
101 |
|
|
|
102 |
question = st.text_input('Type your question')
|
103 |
-
|
|
|
104 |
with st.spinner():
|
105 |
-
if
|
106 |
try:
|
107 |
# Get the raw answer from TAPAS
|
108 |
raw_answer = tqa(table=df, query=question, truncation=True)
|
109 |
|
110 |
-
st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Raw Result: </p>",
|
|
|
111 |
st.success(raw_answer)
|
112 |
|
113 |
-
#
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
|
120 |
except Exception as e:
|
121 |
-
st.warning(
|
122 |
-
|
|
|
1 |
import os
|
|
|
|
|
2 |
import streamlit as st
|
3 |
from st_aggrid import AgGrid
|
4 |
+
import pandas as pd
|
5 |
from transformers import pipeline, T5ForConditionalGeneration, T5Tokenizer
|
6 |
|
7 |
+
# Set the page layout for Streamlit
|
8 |
st.set_page_config(layout="wide")
|
9 |
|
10 |
+
# CSS styling
|
11 |
style = '''
|
12 |
<style>
|
13 |
body {background-color: #F5F5F5; color: #000000;}
|
|
|
23 |
'''
|
24 |
st.markdown(style, unsafe_allow_html=True)
|
25 |
|
26 |
+
st.markdown('<p style="font-family:sans-serif;font-size: 1.9rem;"> HertogAI Question Answering using TAPAS</p>', unsafe_allow_html=True)
|
27 |
st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'>Pre-trained TAPAS model runs on max 64 rows and 32 columns data. Make sure the file data doesn't exceed these dimensions.</p>", unsafe_allow_html=True)
|
28 |
|
29 |
+
# Streamlit re-executes this script on every widget interaction; without
# caching, the TAPAS and T5 weights would be reloaded on each rerun.
# Older Streamlit releases have no `cache_resource`, so fall back to the
# legacy `st.cache` with output mutation allowed.
_cache_models = (
    st.cache_resource
    if hasattr(st, "cache_resource")
    else st.cache(allow_output_mutation=True)
)


@_cache_models
def _load_models():
    """Load the TAPAS pipeline and the T5 tokenizer/model.

    Returns:
        A ``(tqa, t5_tokenizer, t5_model)`` tuple.
    """
    # Initialize TAPAS pipeline
    tapas_pipeline = pipeline(task="table-question-answering",
                              model="google/tapas-large-finetuned-wtq",
                              device="cpu")

    # Initialize T5 tokenizer and model for text generation
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model = T5ForConditionalGeneration.from_pretrained("t5-small")
    return tapas_pipeline, tokenizer, model


tqa, t5_tokenizer, t5_model = _load_models()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
+
# File uploader in the sidebar
|
39 |
file_name = st.sidebar.file_uploader("Upload file:", type=['csv', 'xlsx'])
|
40 |
|
41 |
+
# File processing and question answering
|
42 |
if file_name is None:
|
43 |
st.markdown('<p class="font">Please upload an excel or csv file </p>', unsafe_allow_html=True)
|
44 |
else:
|
|
|
65 |
df_numeric = df.copy()
|
66 |
df = df.astype(str)
|
67 |
|
68 |
+
# Display the first 5 rows of the dataframe in an editable grid
|
69 |
grid_response = AgGrid(
|
70 |
df.head(5),
|
71 |
columns_auto_size_mode='FIT_CONTENTS',
|
|
|
77 |
except Exception as e:
|
78 |
st.error(f"Error reading file: {str(e)}")
|
79 |
|
80 |
+
# User input for the question
|
81 |
question = st.text_input('Type your question')
|
82 |
+
|
83 |
+
def _describe_tapas_answer(raw_answer, columns):
    """Build the plain-English sentence that is handed to T5 for rephrasing.

    Args:
        raw_answer: Result dict from the table-question-answering pipeline;
            must contain 'answer' and may contain 'aggregator',
            'coordinates' and 'cells'.
        columns: Column labels of the queried table, indexed by position.

    Returns:
        A sentence stating the answer and, for aggregated answers, the
        cells that were selected.
    """
    answer = raw_answer['answer']
    aggregator = raw_answer.get('aggregator') or ''

    # The WTQ checkpoint reports "no aggregation" as the literal label
    # "NONE". That string is truthy, so testing `if aggregator:` alone
    # yields "The none of the selected data is ...".
    if aggregator.upper() in ('', 'NONE'):
        return f"The answer is: {answer}"

    # NOTE(review): for aggregated results the pipeline's `answer` appears
    # to be the aggregator label plus the selected cells rather than a
    # computed value -- confirm before presenting it as the aggregate.
    sentence = f"The {aggregator.lower()} of the selected data is {answer}."

    coordinates = raw_answer.get('coordinates', [])
    cells = raw_answer.get('cells', [])
    if coordinates and cells:
        # Rows are shown 1-based; columns are shown by label.
        rows_info = [
            f"Row {row + 1}, Column {columns[col]} with value {cell}"
            for (row, col), cell in zip(coordinates, cells)
        ]
        rows_description = " and ".join(rows_info)
        sentence += f" This includes the following data: {rows_description}."
    return sentence


# Process the answer using TAPAS and T5
with st.spinner():
    if st.button('Answer'):
        try:
            # Get the raw answer from TAPAS
            raw_answer = tqa(table=df, query=question, truncation=True)

            st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Raw Result: </p>",
                        unsafe_allow_html=True)
            st.success(raw_answer)

            # Turn the structured TAPAS result into a base sentence
            base_sentence = _describe_tapas_answer(raw_answer, df.columns)

            # Construct the full input for T5 model by including the original question
            input_text = f"Given the question: '{question}', generate a more human-readable response: {base_sentence}"

            # Tokenize the input and generate a fluent response using T5
            inputs = t5_tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
            summary_ids = t5_model.generate(inputs, max_length=150, num_beams=4, early_stopping=True)

            # Decode the generated text
            generated_text = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

            # Display the final generated response
            st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Final Generated Response: </p>", unsafe_allow_html=True)
            st.success(generated_text)

        except Exception:
            # Deliberately broad: any failure in TAPAS or T5 is shown to the
            # user as the same friendly hint instead of a traceback.
            st.warning("Please retype your question and make sure to use the column name and cell value correctly.")
|
|