Spaces: Build error
Commit 512f2de
1 Parent(s): cd0ba3c
Fix iloc errors around chunking and token length, plus a script allowing Ctrl+Enter submissions
Errors:
FutureWarnings: these indicate that the code relies on deprecated features (here, in pandas) that may be removed in future library versions. Not urgent, but worth addressing to future-proof the code; a minimal sketch of the fixes follows these notes.
Replace DataFrame.swapaxes with DataFrame.transpose.
Replace positional lookups made through Series.__getitem__ with Series.iloc.
Token indices sequence length: the tokenized input sequence is longer than the model's maximum (512 tokens for TAPAS), so the size of the input sequence has to be managed.
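A minimal sketch of the two pandas fixes, assuming a standalone DataFrame named df; the names below are illustrative and not taken from app.py. Chunking with plain .iloc slices sidesteps np.array_split, which routes through the deprecated DataFrame.swapaxes on recent pandas versions, and positional access on a Series goes through .iloc.

import pandas as pd

df = pd.DataFrame({"a": range(10), "b": range(10)})

# Chunk with .iloc slices instead of np.array_split(df, n), which triggers the
# DataFrame.swapaxes FutureWarning on recent pandas versions.
chunk_size = 4
chunks = [df.iloc[i:i + chunk_size] for i in range(0, len(df), chunk_size)]

# Positional lookups via Series.__getitem__ (e.g. row[0]) are deprecated;
# use .iloc for positional access.
row = df.iloc[0]
first_cell = row.iloc[0]  # instead of row[0]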
Also added a script that lets Ctrl+Enter trigger the Submit button.
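One caveat, offered as an assumption rather than documented behavior: script tags injected through st.markdown(..., unsafe_allow_html=True) are added to the page after load and may never execute, so the shortcut can silently do nothing. A common community workaround is streamlit.components.v1.html, which renders in an iframe where scripts do run; the window.parent.document access and the ".stButton button" selector in the sketch below are assumptions about Streamlit's rendered DOM, not a stable API.

# Hedged alternative to the st.markdown script injection used in app.py;
# the selector and window.parent access are assumptions, not a stable API.
import streamlit.components.v1 as components

components.html(
    """
    <script>
    const doc = window.parent.document;  // reach the main app page from the component iframe
    doc.addEventListener("keydown", function (event) {
        if (event.ctrlKey && event.key === "Enter") {
            const btn = doc.querySelector(".stButton button");
            if (btn) { btn.click(); }
        }
    });
    </script>
    """,
    height=0,
)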
app.py CHANGED
@@ -1,8 +1,8 @@
 import streamlit as st
 import pandas as pd
-import numpy as np
 from io import StringIO
 from transformers import AutoTokenizer, AutoModelForTableQuestionAnswering
+import numpy as np
 
 # Initialize TAPAS model and tokenizer
 tokenizer = AutoTokenizer.from_pretrained("google/tapas-large-finetuned-wtq")
@@ -10,40 +10,34 @@ model = AutoModelForTableQuestionAnswering.from_pretrained("google/tapas-large-finetuned-wtq")
 
 def ask_llm_chunk(chunk, questions):
     chunk = chunk.astype(str)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    inputs = tokenizer(table=chunk, queries=questions, padding="max_length", return_tensors="pt")
+
+    # Check for token limit
+    if inputs["input_ids"].shape[1] > 512:
+        st.warning("Token limit exceeded for chunk")
+        return ["Token limit exceeded for chunk"] * len(questions)
+
+    outputs = model(**inputs)
+    predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
+        inputs,
+        outputs.logits.detach(),
+        outputs.logits_aggregation.detach()
+    )
     answers = []
     for coordinates in predicted_answer_coordinates:
         if len(coordinates) == 1:
-            answers.append(chunk.
+            answers.append(chunk.iloc[coordinates[0]].values)
         else:
             cell_values = []
             for coordinate in coordinates:
-                cell_values.append(chunk.
+                cell_values.append(chunk.iloc[coordinate].values)
             answers.append(", ".join(cell_values))
     return answers
 
-MAX_ROWS_PER_CHUNK =
+MAX_ROWS_PER_CHUNK = 200
 
 def summarize_map_reduce(data, questions):
-    try:
-        dataframe = pd.read_csv(StringIO(data))
-    except Exception as e:
-        st.write(f"Error reading the CSV file: {e}")
-        return []
-
+    dataframe = pd.read_csv(StringIO(data))
     num_chunks = len(dataframe) // MAX_ROWS_PER_CHUNK + 1
     dataframe_chunks = np.array_split(dataframe, num_chunks)
     all_answers = []
@@ -68,11 +62,21 @@ if csv_file is not None:
 
 if st.button("Submit"):
     if data and questions:
-
-
-
-
-
-
-
-
+        answers = summarize_map_reduce(data, questions)
+        st.write("Answers:")
+        for q, a in zip(questions, answers):
+            st.write(f"Question: {q}")
+            st.write(f"Answer: {a}")
+
+# Add Ctrl+Enter functionality for submitting the questions
+st.markdown("""
+<script>
+document.addEventListener("DOMContentLoaded", function(event) {
+    document.addEventListener("keydown", function(event) {
+        if (event.ctrlKey && event.key === "Enter") {
+            document.querySelector(".stButton button").click();
+        }
+    });
+});
+</script>
+""", unsafe_allow_html=True)
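A follow-up note on the token-length error: MAX_ROWS_PER_CHUNK = 200 is a fixed row count, so a wide table can still push a chunk past TAPAS's 512-token limit and trip the new warning. Below is a minimal sketch of sizing chunks by tokenized length instead, reusing the tokenizer already loaded in app.py; split_by_token_budget is a hypothetical helper, not part of this commit.

def split_by_token_budget(df, questions, max_tokens=512):
    # Recursively halve the DataFrame until every piece tokenizes within budget.
    # `questions` is a list of strings, as in ask_llm_chunk.
    encoding = tokenizer(table=df.astype(str), queries=questions)
    longest = max(len(ids) for ids in encoding["input_ids"])
    if longest <= max_tokens or len(df) <= 1:
        return [df]
    mid = len(df) // 2
    return (split_by_token_budget(df.iloc[:mid], questions, max_tokens)
            + split_by_token_budget(df.iloc[mid:], questions, max_tokens))

A single row can still overflow the budget on very wide tables, so the shape check inside ask_llm_chunk remains useful as a backstop.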