from copy import deepcopy
from io import StringIO

import numpy as np
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForTableQuestionAnswering

# Retrieval model used to embed tables and questions.
retriever = SentenceTransformer("deepset/all-mpnet-base-v2-table")
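
# log_debug_info is referenced below but not defined in this listing; it presumably lives
# elsewhere in the app. A minimal stand-in (an assumption, not the original helper) so the
# functions below can run as-is:
def log_debug_info(message):
    print(message)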

def embed_table(table):
    # Serialize the DataFrame to CSV text, then encode it with the retriever.
    processed_table = table.to_csv(index=False)
    return retriever.encode(processed_table)

def embed_question(question):
    return retriever.encode(question)
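
# Not part of the original app: a hedged sketch of how the embeddings above could be used
# to pick the most relevant table for a question via cosine similarity. The helper name
# rank_tables_by_question is hypothetical.
def rank_tables_by_question(question, tables):
    from sentence_transformers import util
    question_embedding = embed_question(question)
    table_embeddings = np.stack([embed_table(table) for table in tables])
    # cos_sim returns a 1 x N similarity matrix; higher scores mean more relevant tables.
    scores = util.cos_sim(question_embedding, table_embeddings)[0]
    return sorted(zip(tables, scores.tolist()), key=lambda pair: pair[1], reverse=True)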

def initialize_tapas():
    tokenizer = AutoTokenizer.from_pretrained("google/tapas-large-finetuned-wtq")
    model = AutoModelForTableQuestionAnswering.from_pretrained("google/tapas-large-finetuned-wtq")
    return tokenizer, model

def ask_llm_chunk(tokenizer, model, chunk, questions):
    # TAPAS expects every cell to be a string.
    chunk = chunk.astype(str)
    try:
        inputs = tokenizer(
            table=chunk,
            queries=questions,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
    except Exception as e:
        log_debug_info(f"Tokenization error: {e}")
        st.write(f"An error occurred: {e}")
        return ["Error occurred while tokenizing"] * len(questions)

    # TAPAS can handle at most 512 tokens per sequence.
    if inputs["input_ids"].shape[1] > 512:
        log_debug_info("Token limit exceeded for chunk")
        st.warning("Token limit exceeded for chunk")
        return ["Token limit exceeded for chunk"] * len(questions)
    outputs = model(**inputs)
    # Convert logits into predicted cell coordinates and aggregation operator indices.
    predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
        inputs,
        outputs.logits.detach(),
        outputs.logits_aggregation.detach(),
    )

    answers = []
    for coordinates in predicted_answer_coordinates:
        if len(coordinates) == 1:
            # Single-cell answer: read the value straight out of the chunk.
            row, col = coordinates[0]
            try:
                value = chunk.iloc[row, col]
                log_debug_info(f"Accessed value for row {row}, col {col}: {value}")
                answers.append(value)
            except Exception as e:
                log_debug_info(f"Error accessing value for row {row}, col {col}: {e}")
                st.write(f"An error occurred: {e}")
        else:
            # Multi-cell answer: collect every selected cell and join them.
            cell_values = []
            for coordinate in coordinates:
                row, col = coordinate
                try:
                    value = chunk.iloc[row, col]
                    cell_values.append(value)
                except Exception as e:
                    log_debug_info(f"Error accessing value for row {row}, col {col}: {e}")
                    st.write(f"An error occurred: {e}")
            answers.append(", ".join(map(str, cell_values)))
    return answers
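
# Not in the original code: ask_llm_chunk discards predicted_aggregation_indices. If the
# aggregation operator is also wanted, the id-to-operator mapping documented for the
# WTQ-finetuned TAPAS checkpoints could be applied to each prediction, roughly like this.
def aggregation_label(aggregation_index):
    # 0 means the selected cells themselves are the answer; the others name an operator.
    id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}
    return id2aggregation.get(aggregation_index, "NONE")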

MAX_ROWS_PER_CHUNK = 200

def summarize_map_reduce(tokenizer, model, data, questions):
    # Split the CSV into row chunks and ask every question against each chunk.
    dataframe = pd.read_csv(StringIO(data))
    num_chunks = len(dataframe) // MAX_ROWS_PER_CHUNK + 1
    dataframe_chunks = [deepcopy(chunk) for chunk in np.array_split(dataframe, num_chunks)]
    all_answers = []
    for chunk in dataframe_chunks:
        chunk_answers = ask_llm_chunk(tokenizer, model, chunk, questions)
        all_answers.extend(chunk_answers)
    return all_answers
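
# Hedged usage sketch, not in the original Space: wiring the pieces together for a small
# CSV string and two questions. The sample data and questions are made up.
if __name__ == "__main__":
    tapas_tokenizer, tapas_model = initialize_tapas()
    csv_data = "name,age\nAlice,30\nBob,25\n"
    questions = ["How old is Alice?", "How many people are listed?"]
    answers = summarize_map_reduce(tapas_tokenizer, tapas_model, csv_data, questions)
    for question, answer in zip(questions, answers):
        print(f"{question} -> {answer}")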