from copy import deepcopy
from io import StringIO

import numpy as np
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForTableQuestionAnswering

def log_debug_info(message):
    # Minimal stand-in for the app's debug logger (assumed to be defined elsewhere);
    # included here so the module runs on its own.
    print(f"[DEBUG] {message}")

# Table-aware sentence-transformer used to embed tables and questions for retrieval.
retriever = SentenceTransformer("deepset/all-mpnet-base-v2-table")

def embed_table(table):
    # Serialise the DataFrame to CSV text and embed it with the table retriever.
    processed_table = table.to_csv(index=False)
    return retriever.encode(processed_table)

def embed_question(question):
    # Embed the question into the same vector space as the tables.
    return retriever.encode(question)

def initialize_tapas():
    # Load TAPAS (large, fine-tuned on WikiTableQuestions) for table question answering.
    tokenizer = AutoTokenizer.from_pretrained("google/tapas-large-finetuned-wtq")
    model = AutoModelForTableQuestionAnswering.from_pretrained("google/tapas-large-finetuned-wtq")
    return tokenizer, model

def ask_llm_chunk(tokenizer, model, chunk, questions):
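    # Ask all questions against a single table chunk with TAPAS and return one answer per question.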
    chunk = chunk.astype(str)
    try:
        inputs = tokenizer(table=chunk, queries=questions, padding="max_length", truncation=True, return_tensors="pt")
    except Exception as e:
        log_debug_info(f"Tokenization error: {e}")
        st.write(f"An error occurred: {e}")
        return ["Error occurred while tokenizing"] * len(questions)

    if inputs["input_ids"].shape[1] > 512:
        log_debug_info("Token limit exceeded for chunk")
        st.warning("Token limit exceeded for chunk")
        return ["Token limit exceeded for chunk"] * len(questions)

    outputs = model(**inputs)
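    # Convert token-level logits into predicted cell coordinates per question
    # (aggregation indices are returned but not used here).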
    predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
        inputs,
        outputs.logits.detach(),
        outputs.logits_aggregation.detach()
    )

    answers = []
    # Map each question's predicted cell coordinates back to values in the chunk.
    for coordinates in predicted_answer_coordinates:
        if len(coordinates) == 1:
            # Single-cell answer.
            row, col = coordinates[0]
            try:
                value = chunk.iloc[row, col]
                log_debug_info(f"Accessed value for row {row}, col {col}: {value}")
                answers.append(value)
            except Exception as e:
                log_debug_info(f"Error accessing value for row {row}, col {col}: {e}")
                st.write(f"An error occurred: {e}")
                # Keep answers aligned with questions even when a cell lookup fails.
                answers.append("Error accessing cell")
        else:
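            # Multi-cell answer: collect and join the values of all predicted cells.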
            cell_values = []
            for coordinate in coordinates:
                row, col = coordinate
                try:
                    value = chunk.iloc[row, col]
                    cell_values.append(value)
                except Exception as e:
                    log_debug_info(f"Error accessing value for row {row}, col {col}: {e}")
                    st.write(f"An error occurred: {e}")
            answers.append(", ".join(map(str, cell_values)))

    return answers

MAX_ROWS_PER_CHUNK = 200  # max rows per table chunk passed to TAPAS (input is capped at 512 tokens above)

def summarize_map_reduce(tokenizer, model, data, questions):
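    # Map: run the questions over each row chunk with TAPAS; reduce: concatenate the answers.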
    dataframe = pd.read_csv(StringIO(data))
    num_chunks = len(dataframe) // MAX_ROWS_PER_CHUNK + 1
    dataframe_chunks = [deepcopy(chunk) for chunk in np.array_split(dataframe, num_chunks)]
    all_answers = []
    for chunk in dataframe_chunks:
        chunk_answers = ask_llm_chunk(tokenizer, model, chunk, questions)
        all_answers.extend(chunk_answers)
    return all_answers
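
# --- Example usage (illustrative sketch, not part of the original module) ---
# The CSV string, questions, and __main__ guard below are assumptions added for
# demonstration; real tables would come from the surrounding Streamlit app.
if __name__ == "__main__":
    tokenizer, model = initialize_tapas()

    sample_csv = (
        "city,population\n"
        "Berlin,3645000\n"
        "Hamburg,1841000\n"
        "Munich,1472000\n"
    )
    questions = ["Which city has the largest population?"]

    answers = summarize_map_reduce(tokenizer, model, sample_csv, questions)
    for question, answer in zip(questions, answers):
        print(f"{question} -> {answer}")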