MCK-02 committed on
Commit 4ec4c31 · 1 Parent(s): 52963c7

Update app.py

Files changed (1)
  1. app.py +5 -2
app.py CHANGED
@@ -65,7 +65,7 @@ MAX_INPUT_LENGTH = 256
 MAX_TARGET_LENGTH = 128
 
 
-def preprocess_function(examples):
+def preprocess_function(examples, **kwargs):
     """
     Preprocess entries of the given dataset
 
@@ -74,12 +74,14 @@ def preprocess_function(examples):
     Returns:
         model_inputs (BatchEncoding): tokenized dataset entries
     """
+
     inputs, targets = [], []
     for i in range(len(examples['question'])):
         inputs.append(f"Antwort: {examples['provided_answer'][i]} Lösung: {examples['reference_answer'][i]} Frage: {examples['question'][i]}")
         targets.append(f"{examples['verification_feedback'][i]} Feedback: {examples['answer_feedback'][i]}")
 
     # apply tokenization to inputs and labels
+    tokenizer = kwargs["tokenizer"]
     model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, padding='max_length', truncation=True)
     labels = tokenizer(text_target=targets, max_length=MAX_TARGET_LENGTH, padding='max_length', truncation=True)
 
@@ -200,7 +202,8 @@ def load_data():
     processed_dataset = split.map(
         preprocess_function,
         batched=True,
-        remove_columns=split.column_names
+        remove_columns=split.column_names,
+        fn_kwargs={"tokenizer": tokenizer}
     )
     processed_dataset.set_format('torch')
 
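The change threads the tokenizer into preprocess_function through the fn_kwargs argument of datasets' Dataset.map instead of relying on a global tokenizer name. Below is a minimal, self-contained sketch of the same pattern; the "google/flan-t5-base" checkpoint and the toy example rows are assumptions for illustration and are not taken from app.py.

from datasets import Dataset
from transformers import AutoTokenizer

MAX_INPUT_LENGTH = 256
MAX_TARGET_LENGTH = 128


def preprocess_function(examples, **kwargs):
    """Build prompt/target pairs and tokenize them, as in the patched app.py."""
    inputs, targets = [], []
    for i in range(len(examples['question'])):
        inputs.append(f"Antwort: {examples['provided_answer'][i]} Lösung: {examples['reference_answer'][i]} Frage: {examples['question'][i]}")
        targets.append(f"{examples['verification_feedback'][i]} Feedback: {examples['answer_feedback'][i]}")

    # the tokenizer now arrives via Dataset.map(..., fn_kwargs=...) rather than module scope
    tokenizer = kwargs["tokenizer"]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, padding='max_length', truncation=True)
    labels = tokenizer(text_target=targets, max_length=MAX_TARGET_LENGTH, padding='max_length', truncation=True)
    # attach the tokenized targets as labels (standard seq2seq recipe; the rest of
    # the original function is not visible in this diff)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs


tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")  # assumed checkpoint

# toy split using the column names referenced in the diff
split = Dataset.from_dict({
    "question": ["Was ist 2 + 2?"],
    "provided_answer": ["5"],
    "reference_answer": ["4"],
    "verification_feedback": ["Incorrect"],
    "answer_feedback": ["Die richtige Antwort ist 4."],
})

processed_dataset = split.map(
    preprocess_function,
    batched=True,
    remove_columns=split.column_names,
    fn_kwargs={"tokenizer": tokenizer},
)
processed_dataset.set_format('torch')
print(processed_dataset[0]['input_ids'].shape)  # torch.Size([256])

Passing the tokenizer through fn_kwargs makes the dependency explicit and lets preprocess_function work even when the tokenizer only exists in the caller's scope, as appears to be the case in load_data.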