jskinner215 committed
Commit 512f2de · 1 Parent(s): cd0ba3c

Fix iloc errors around chunking and token length; add script allowing Ctrl+Enter submissions


Errors:

FutureWarnings: These warnings indicate that the code relies on deprecated pandas features that may be removed in future library versions. While not urgent, addressing them now future-proofs the code.
Replace 'DataFrame.swapaxes' with 'DataFrame.transpose'.
Replace positional indexing through Series.__getitem__ with Series.iloc (see the sketch below).
Token indices sequence length: This means the tokenized input sequence is longer than the maximum the model accepts (512 tokens for TAPAS), so the size of each input chunk has to be managed.
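A minimal sketch of the two FutureWarning fixes, assuming a recent pandas version; the Series and DataFrame below are illustrative only and do not appear in app.py. The swapaxes warning most likely comes from calling np.array_split on a DataFrame, which plain row slicing with .iloc avoids:

    import pandas as pd

    # Illustrative data, not from app.py
    ser = pd.Series(["a", "b", "c"], index=[10, 20, 30])

    # FutureWarning: Series.__getitem__ treating keys as positions is deprecated
    # value = ser[0]
    value = ser.iloc[0]  # explicit positional access

    df = pd.DataFrame({"city": ["Paris", "Oslo", "Lima"], "rows": [1, 2, 3]})

    # np.array_split(df, n) goes through DataFrame.swapaxes and warns on recent
    # pandas; slicing rows with .iloc yields equivalent chunks without the warning.
    chunk_size = 2
    chunks = [df.iloc[i:i + chunk_size] for i in range(0, len(df), chunk_size)]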
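Likewise, a rough sketch of one way to manage the 512-token TAPAS limit: halve a chunk's rows until the tokenized table fits. fit_chunk is a hypothetical helper for illustration, not something added in this commit; it reuses the tokenizer already loaded in app.py:

    def fit_chunk(chunk, questions, max_length=512):
        # Hypothetical helper, not part of this commit.
        # Tokenize the chunk with the questions and measure the sequence length.
        inputs = tokenizer(table=chunk.astype(str), queries=questions,
                           padding="max_length", return_tensors="pt")
        if inputs["input_ids"].shape[1] <= max_length or len(chunk) <= 1:
            return [chunk]
        # Too long: split the rows in half and retry each half.
        mid = len(chunk) // 2
        return (fit_chunk(chunk.iloc[:mid], questions, max_length)
                + fit_chunk(chunk.iloc[mid:], questions, max_length))

Each sub-chunk this returns could then be passed to ask_llm_chunk as before.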


Also added a script to accept "Ctrl+Enter" as Submit.

Files changed (1)
  1. app.py +36 -32
app.py CHANGED
@@ -1,8 +1,8 @@
 import streamlit as st
 import pandas as pd
-import numpy as np
 from io import StringIO
 from transformers import AutoTokenizer, AutoModelForTableQuestionAnswering
+import numpy as np
 
 # Initialize TAPAS model and tokenizer
 tokenizer = AutoTokenizer.from_pretrained("google/tapas-large-finetuned-wtq")
@@ -10,40 +10,34 @@ model = AutoModelForTableQuestionAnswering.from_pretrained("google/tapas-large-f
 
 def ask_llm_chunk(chunk, questions):
     chunk = chunk.astype(str)
-    try:
-        inputs = tokenizer(table=chunk, queries=questions, padding="max_length", return_tensors="pt")
-        if inputs["input_ids"].shape[1] > 512:
-            return ["Token limit exceeded for this chunk"] * len(questions)
-        outputs = model(**inputs)
-        predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
-            inputs,
-            outputs.logits.detach(),
-            outputs.logits_aggregation.detach()
-        )
-    except Exception as e:
-        st.write(f"An error occurred: {e}")
-        return ["Error processing this chunk"] * len(questions)
-
+    inputs = tokenizer(table=chunk, queries=questions, padding="max_length", return_tensors="pt")
+
+    # Check for token limit
+    if inputs["input_ids"].shape[1] > 512:
+        st.warning("Token limit exceeded for chunk")
+        return ["Token limit exceeded for chunk"] * len(questions)
+
+    outputs = model(**inputs)
+    predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
+        inputs,
+        outputs.logits.detach(),
+        outputs.logits_aggregation.detach()
+    )
     answers = []
     for coordinates in predicted_answer_coordinates:
         if len(coordinates) == 1:
-            answers.append(chunk.iat[coordinates[0]])
+            answers.append(chunk.iloc[coordinates[0]].values)
         else:
             cell_values = []
             for coordinate in coordinates:
-                cell_values.append(chunk.iat[coordinate])
+                cell_values.append(chunk.iloc[coordinate].values)
             answers.append(", ".join(cell_values))
     return answers
 
-MAX_ROWS_PER_CHUNK = 50  # Reduced chunk size
+MAX_ROWS_PER_CHUNK = 200
 
 def summarize_map_reduce(data, questions):
-    try:
-        dataframe = pd.read_csv(StringIO(data))
-    except Exception as e:
-        st.write(f"Error reading the CSV file: {e}")
-        return []
-
+    dataframe = pd.read_csv(StringIO(data))
     num_chunks = len(dataframe) // MAX_ROWS_PER_CHUNK + 1
     dataframe_chunks = np.array_split(dataframe, num_chunks)
     all_answers = []
@@ -68,11 +62,21 @@ if csv_file is not None:
 
 if st.button("Submit"):
     if data and questions:
-        try:
-            answers = summarize_map_reduce(data, questions)
-            st.write("Answers:")
-            for q, a in zip(questions, answers):
-                st.write(f"Question: {q}")
-                st.write(f"Answer: {a}")
-        except Exception as e:
-            st.write(f"An error occurred: {e}")
+        answers = summarize_map_reduce(data, questions)
+        st.write("Answers:")
+        for q, a in zip(questions, answers):
+            st.write(f"Question: {q}")
+            st.write(f"Answer: {a}")
+
+# Add Ctrl+Enter functionality for submitting the questions
+st.markdown("""
+<script>
+document.addEventListener("DOMContentLoaded", function(event) {
+    document.addEventListener("keydown", function(event) {
+        if (event.ctrlKey && event.key === "Enter") {
+            document.querySelector(".stButton button").click();
+        }
+    });
+});
+</script>
+""", unsafe_allow_html=True)