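"""Streamlit app for table question answering with TAPAS.

Users upload a CSV file and enter one question per line; the table is split into
row chunks and each chunk is queried with the google/tapas-large-finetuned-wtq
model. Assumes streamlit, pandas, numpy, transformers, and torch are installed.
"""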
import streamlit as st
import pandas as pd
import numpy as np
from io import StringIO
from transformers import AutoTokenizer, AutoModelForTableQuestionAnswering

# Load the TAPAS model and tokenizer once and cache them across Streamlit reruns
@st.cache_resource
def load_tapas():
    tokenizer = AutoTokenizer.from_pretrained("google/tapas-large-finetuned-wtq")
    model = AutoModelForTableQuestionAnswering.from_pretrained("google/tapas-large-finetuned-wtq")
    return tokenizer, model

tokenizer, model = load_tapas()

def ask_llm_chunk(chunk, questions):
    """Run TAPAS over a single table chunk and return one answer string per question."""
    # TAPAS expects every table cell to be a string
    chunk = chunk.astype(str)
    try:
        inputs = tokenizer(table=chunk, queries=questions, padding="max_length", return_tensors="pt")
        # TAPAS accepts at most 512 tokens; skip chunks whose encoding is longer
        if inputs["input_ids"].shape[1] > 512:
            return ["Token limit exceeded for this chunk"] * len(questions)
        outputs = model(**inputs)
        # Convert logits to cell coordinates; the aggregation indices (SUM, AVERAGE, ...)
        # are returned but not applied here, so answers are the selected cells only
        predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
            inputs,
            outputs.logits.detach(),
            outputs.logits_aggregation.detach()
        )
    except Exception as e:
        st.write(f"An error occurred: {e}")
        return ["Error processing this chunk"] * len(questions)

    # Map each question's predicted (row, column) coordinates back to cell values
    answers = []
    for coordinates in predicted_answer_coordinates:
        if len(coordinates) == 1:
            answers.append(chunk.iat[coordinates[0]])
        else:
            cell_values = [chunk.iat[coordinate] for coordinate in coordinates]
            answers.append(", ".join(cell_values))
    return answers
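
# A minimal helper sketch (not used above): maps the predicted aggregation indices
# returned by convert_logits_to_predictions to operator names, assuming the WTQ
# label mapping documented for TAPAS (0: NONE, 1: SUM, 2: AVERAGE, 3: COUNT).
def aggregation_labels(predicted_aggregation_indices):
    id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}
    return [id2aggregation[idx] for idx in predicted_aggregation_indices]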

MAX_ROWS_PER_CHUNK = 50  # Rows per chunk; kept small so each chunk stays within TAPAS's 512-token limit

def summarize_map_reduce(data, questions):
    """Split the CSV into row chunks and ask every question against each chunk.

    Returns a flat list of answers ordered chunk by chunk: the first
    len(questions) entries come from chunk 0, the next len(questions) from chunk 1, etc.
    """
    try:
        dataframe = pd.read_csv(StringIO(data))
    except Exception as e:
        st.write(f"Error reading the CSV file: {e}")
        return []

    # Split the table into chunks small enough for TAPAS's input limit
    num_chunks = len(dataframe) // MAX_ROWS_PER_CHUNK + 1
    dataframe_chunks = np.array_split(dataframe, num_chunks)
    all_answers = []
    for chunk in dataframe_chunks:
        chunk_answers = ask_llm_chunk(chunk, questions)
        all_answers.extend(chunk_answers)
    return all_answers

st.title("TAPAS Table Question Answering")

# Upload CSV data
csv_file = st.file_uploader("Upload a CSV file", type=["csv"])
if csv_file is not None:
    data = csv_file.read().decode("utf-8")
    st.write("CSV Data Preview:")
    st.write(pd.read_csv(StringIO(data)).head())

    # Input for questions, one per line; blank lines are dropped
    questions = st.text_area("Enter your questions (one per line)")
    questions = [q.strip() for q in questions.split("\n") if q.strip()]

    if st.button("Submit"):
        if data and questions:
            try:
                answers = summarize_map_reduce(data, questions)
                st.write("Answers:")
                # Answers come back chunk by chunk, so gather every chunk's answer
                # for each question instead of showing only the first chunk's
                for i, q in enumerate(questions):
                    per_chunk_answers = answers[i::len(questions)]
                    st.write(f"Question: {q}")
                    st.write(f"Answer: {'; '.join(per_chunk_answers)}")
            except Exception as e:
                st.write(f"An error occurred: {e}")