Spaces:

hertogateis
/

Table_QandA_v1

Sleeping

App Files Files Community

hertogateis committed on Jan 3

Commit

e4382ce

verified ·

1 Parent(s): 7ffe460

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -44

app.py CHANGED Viewed

@@ -1,13 +1,13 @@
 import os
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
 import streamlit as st
 from st_aggrid import AgGrid
-import pandas as pd
 from transformers import pipeline, T5ForConditionalGeneration, T5Tokenizer
 st.set_page_config(layout="wide")
 style = '''
     <style>
         body {background-color: #F5F5F5; color: #000000;}
@@ -23,45 +23,22 @@ style = '''
 '''
 st.markdown(style, unsafe_allow_html=True)
-st.markdown('<p style="font-family:sans-serif;font-size: 1.9rem;"> HertogAI Q&A table V1 using TAPAS and Text Generated</p>', unsafe_allow_html=True)
 st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'>Pre-trained TAPAS model runs on max 64 rows and 32 columns data. Make sure the file data doesn't exceed these dimensions.</p>", unsafe_allow_html=True)
-# Initialize TAPAS and Hugging Face Model (T5 for NLP generation)
 tqa = pipeline(task="table-question-answering",
               model="google/tapas-large-finetuned-wtq",
               device="cpu")
-model_name = "t5-small"  # You can use a larger model or GPT as needed
-tokenizer = T5Tokenizer.from_pretrained(model_name)
-model = T5ForConditionalGeneration.from_pretrained(model_name)
-# Function to generate natural language from TAPAS output
-def generate_nlp_from_tapas(tapas_output, df):
-    """
-    Use Hugging Face's T5 model to generate natural language text from TAPAS output.
-    """
-    try:
-        # Construct prompt using TAPAS output
-        answer = tapas_output['answer']
-        coordinates = tapas_output['coordinates']
-        answer_data = [df.iloc[row, col] for row, col in coordinates]
-        # Format the prompt for NLP model
-        prompt = f"Answer: {answer}. Data Location: Rows {coordinates}, Values: {answer_data}. Please summarize this information in a natural language sentence."
-        # Tokenize input and generate response
-        inputs = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=512)
-        outputs = model.generate(inputs, max_length=100, num_beams=5, early_stopping=True)
-        # Decode and return the generated response
-        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return response
-    except Exception as e:
-        return f"Error generating response: {str(e)}"
 file_name = st.sidebar.file_uploader("Upload file:", type=['csv', 'xlsx'])
 if file_name is None:
     st.markdown('<p class="font">Please upload an excel or csv file </p>', unsafe_allow_html=True)
 else:
@@ -88,6 +65,7 @@ else:
             df_numeric = df.copy()
             df = df.astype(str)
             grid_response = AgGrid(
                 df.head(5),
                 columns_auto_size_mode='FIT_CONTENTS',
@@ -99,24 +77,50 @@ else:
     except Exception as e:
         st.error(f"Error reading file: {str(e)}")
     question = st.text_input('Type your question')
     with st.spinner():
-        if(st.button('Answer')):
             try:
                 # Get the raw answer from TAPAS
                 raw_answer = tqa(table=df, query=question, truncation=True)
-                st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Raw Result: </p>", unsafe_allow_html=True)
                 st.success(raw_answer)
-                # Use Hugging Face's T5 model to generate NLP text from TAPAS output
-                final_answer = generate_nlp_from_tapas(raw_answer, df)
-                # Display the generated answer in a simple format
-                st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Generated Answer: </p>", unsafe_allow_html=True)
-                st.success(final_answer)
             except Exception as e:
-                st.warning(f"Error: {str(e)} - Please retype your question and ensure it is correctly formatted.")

 import os
 import streamlit as st
 from st_aggrid import AgGrid
+import pandas as pd
 from transformers import pipeline, T5ForConditionalGeneration, T5Tokenizer
+# Set the page layout for Streamlit
 st.set_page_config(layout="wide")
+# CSS styling
 style = '''
     <style>
         body {background-color: #F5F5F5; color: #000000;}
 '''
 st.markdown(style, unsafe_allow_html=True)
+st.markdown('<p style="font-family:sans-serif;font-size: 1.9rem;"> HertogAI Question Answering using TAPAS</p>', unsafe_allow_html=True)
 st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'>Pre-trained TAPAS model runs on max 64 rows and 32 columns data. Make sure the file data doesn't exceed these dimensions.</p>", unsafe_allow_html=True)
+# Initialize TAPAS pipeline
 tqa = pipeline(task="table-question-answering",
               model="google/tapas-large-finetuned-wtq",
               device="cpu")
+# Initialize T5 tokenizer and model for text generation
+t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
+t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
+# File uploader in the sidebar
 file_name = st.sidebar.file_uploader("Upload file:", type=['csv', 'xlsx'])
+# File processing and question answering
 if file_name is None:
     st.markdown('<p class="font">Please upload an excel or csv file </p>', unsafe_allow_html=True)
 else:
             df_numeric = df.copy()
             df = df.astype(str)
+            # Display the first 5 rows of the dataframe in an editable grid
             grid_response = AgGrid(
                 df.head(5),
                 columns_auto_size_mode='FIT_CONTENTS',
     except Exception as e:
         st.error(f"Error reading file: {str(e)}")
+    # User input for the question
     question = st.text_input('Type your question')
+    # Process the answer using TAPAS and T5
     with st.spinner():
+        if st.button('Answer'):
             try:
                 # Get the raw answer from TAPAS
                 raw_answer = tqa(table=df, query=question, truncation=True)
+                st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Raw Result: </p>",
+                           unsafe_allow_html=True)
                 st.success(raw_answer)
+                # Extract relevant information from the TAPAS result
+                answer = raw_answer['answer']
+                aggregator = raw_answer.get('aggregator', '')
+                coordinates = raw_answer.get('coordinates', [])
+                cells = raw_answer.get('cells', [])
+                # Create a base sentence
+                if aggregator:
+                    base_sentence = f"The {aggregator.lower()} of the selected data is {answer}."
+                    if coordinates and cells:
+                        rows_info = [f"Row {coordinate[0]+1}, Column {df.columns[coordinate[1]]} with value {cell}"
+                                     for coordinate, cell in zip(coordinates, cells)]
+                        rows_description = " and ".join(rows_info)
+                        base_sentence += f" This includes the following data: {rows_description}."
+                else:
+                    base_sentence = f"The answer is: {answer}"
+                # Construct the full input for T5 model by including the original question
+                input_text = f"Given the question: '{question}', generate a more human-readable response: {base_sentence}"
+                # Tokenize the input and generate a fluent response using T5
+                inputs = t5_tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
+                summary_ids = t5_model.generate(inputs, max_length=150, num_beams=4, early_stopping=True)
+                # Decode the generated text
+                generated_text = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+                # Display the final generated response
+                st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Final Generated Response: </p>", unsafe_allow_html=True)
+                st.success(generated_text)
             except Exception as e:
+                st.warning("Please retype your question and make sure to use the column name and cell value correctly.")