jskinner215 committed
Commit 512f2de · 1 Parent(s): cd0ba3c

Fix iloc errors around chunking and token length; add script allowing Ctrl+Enter submissions


Errors:

FutureWarnings: These warnings indicate that the code relies on deprecated pandas features that may be removed in future library versions. While not urgent, addressing them now future-proofs the code.
Replace 'DataFrame.swapaxes' with 'DataFrame.transpose'.
Replace positional indexing through Series.__getitem__ with Series.iloc (see the sketch below).
Token indices sequence length: This means the tokenized input sequence is longer than the maximum the model accepts (512 tokens for TAPAS), so the size of each input chunk has to be managed.
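A minimal sketch of the two FutureWarning fixes, assuming a recent pandas version; the Series and DataFrame below are illustrative only and do not appear in app.py. The swapaxes warning most likely comes from calling np.array_split on a DataFrame, which plain row slicing with .iloc avoids:

    import pandas as pd

    # Illustrative data, not from app.py
    ser = pd.Series(["a", "b", "c"], index=[10, 20, 30])

    # FutureWarning: Series.__getitem__ treating keys as positions is deprecated
    # value = ser[0]
    value = ser.iloc[0]  # explicit positional access

    df = pd.DataFrame({"city": ["Paris", "Oslo", "Lima"], "rows": [1, 2, 3]})

    # np.array_split(df, n) goes through DataFrame.swapaxes and warns on recent
    # pandas; slicing rows with .iloc yields equivalent chunks without the warning.
    chunk_size = 2
    chunks = [df.iloc[i:i + chunk_size] for i in range(0, len(df), chunk_size)]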
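Likewise, a rough sketch of one way to manage the 512-token TAPAS limit: halve a chunk's rows until the tokenized table fits. fit_chunk is a hypothetical helper for illustration, not something added in this commit; it reuses the tokenizer already loaded in app.py:

    def fit_chunk(chunk, questions, max_length=512):
        # Hypothetical helper, not part of this commit.
        # Tokenize the chunk with the questions and measure the sequence length.
        inputs = tokenizer(table=chunk.astype(str), queries=questions,
                           padding="max_length", return_tensors="pt")
        if inputs["input_ids"].shape[1] <= max_length or len(chunk) <= 1:
            return [chunk]
        # Too long: split the rows in half and retry each half.
        mid = len(chunk) // 2
        return (fit_chunk(chunk.iloc[:mid], questions, max_length)
                + fit_chunk(chunk.iloc[mid:], questions, max_length))

Each sub-chunk this returns could then be passed to ask_llm_chunk as before.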


Also added a script to accept "Ctrl+Enter" as Submit.

Files changed (1)
  1. app.py +36 -32
app.py CHANGED
@@ -1,8 +1,8 @@
 import streamlit as st
 import pandas as pd
-import numpy as np
 from io import StringIO
 from transformers import AutoTokenizer, AutoModelForTableQuestionAnswering
+import numpy as np
 
 # Initialize TAPAS model and tokenizer
 tokenizer = AutoTokenizer.from_pretrained("google/tapas-large-finetuned-wtq")
@@ -10,40 +10,34 @@ model = AutoModelForTableQuestionAnswering.from_pretrained("google/tapas-large-f
 
 def ask_llm_chunk(chunk, questions):
     chunk = chunk.astype(str)
-    try:
-        inputs = tokenizer(table=chunk, queries=questions, padding="max_length", return_tensors="pt")
-        if inputs["input_ids"].shape[1] > 512:
-            return ["Token limit exceeded for this chunk"] * len(questions)
-        outputs = model(**inputs)
-        predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
-            inputs,
-            outputs.logits.detach(),
-            outputs.logits_aggregation.detach()
-        )
-    except Exception as e:
-        st.write(f"An error occurred: {e}")
-        return ["Error processing this chunk"] * len(questions)
-
+    inputs = tokenizer(table=chunk, queries=questions, padding="max_length", return_tensors="pt")
+
+    # Check for token limit
+    if inputs["input_ids"].shape[1] > 512:
+        st.warning("Token limit exceeded for chunk")
+        return ["Token limit exceeded for chunk"] * len(questions)
+
+    outputs = model(**inputs)
+    predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
+        inputs,
+        outputs.logits.detach(),
+        outputs.logits_aggregation.detach()
+    )
     answers = []
     for coordinates in predicted_answer_coordinates:
         if len(coordinates) == 1:
-            answers.append(chunk.iat[coordinates[0]])
+            answers.append(chunk.iloc[coordinates[0]].values)
         else:
             cell_values = []
             for coordinate in coordinates:
-                cell_values.append(chunk.iat[coordinate])
+                cell_values.append(chunk.iloc[coordinate].values)
             answers.append(", ".join(cell_values))
     return answers
 
-MAX_ROWS_PER_CHUNK = 50  # Reduced chunk size
+MAX_ROWS_PER_CHUNK = 200
 
 def summarize_map_reduce(data, questions):
-    try:
-        dataframe = pd.read_csv(StringIO(data))
-    except Exception as e:
-        st.write(f"Error reading the CSV file: {e}")
-        return []
-
+    dataframe = pd.read_csv(StringIO(data))
     num_chunks = len(dataframe) // MAX_ROWS_PER_CHUNK + 1
     dataframe_chunks = np.array_split(dataframe, num_chunks)
     all_answers = []
@@ -68,11 +62,21 @@ if csv_file is not None:
 
 if st.button("Submit"):
     if data and questions:
-        try:
-            answers = summarize_map_reduce(data, questions)
-            st.write("Answers:")
-            for q, a in zip(questions, answers):
-                st.write(f"Question: {q}")
-                st.write(f"Answer: {a}")
-        except Exception as e:
-            st.write(f"An error occurred: {e}")
+        answers = summarize_map_reduce(data, questions)
+        st.write("Answers:")
+        for q, a in zip(questions, answers):
+            st.write(f"Question: {q}")
+            st.write(f"Answer: {a}")
+
+# Add Ctrl+Enter functionality for submitting the questions
+st.markdown("""
+<script>
+document.addEventListener("DOMContentLoaded", function(event) {
+    document.addEventListener("keydown", function(event) {
+        if (event.ctrlKey && event.key === "Enter") {
+            document.querySelector(".stButton button").click();
+        }
+    });
+});
+</script>
+""", unsafe_allow_html=True)