MCK-02 committed on
Commit
9e33067
·
1 Parent(s): 1d6c749

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -3
app.py CHANGED
@@ -64,15 +64,16 @@ bertscore = load_metric('bertscore')
64
  MAX_INPUT_LENGTH = 256
65
  MAX_TARGET_LENGTH = 128
66
 
67
- def preprocess_function(examples, tokenizer):
68
- """
 
69
  Preprocess entries of the given dataset
70
 
71
  Params:
72
  examples (Dataset): dataset to be preprocessed
73
  Returns:
74
  model_inputs (BatchEncoding): tokenized dataset entries
75
- """
76
  inputs, targets = [], []
77
  for i in range(len(examples['question'])):
78
  inputs.append(f"Antwort: {examples['provided_answer'][i]} Lösung: {examples['reference_answer'][i]} Frage: {examples['question'][i]}")
@@ -85,6 +86,7 @@ def preprocess_function(examples, tokenizer):
85
  model_inputs['labels'] = labels['input_ids']
86
 
87
  return model_inputs
 
88
 
89
 
90
  def flatten_list(l):
@@ -198,6 +200,28 @@ def load_data():
198
  model = AutoModelForSeq2SeqLM.from_pretrained(get_model(ds))
199
  tokenizer = AutoTokenizer.from_pretrained(get_tokenizer(ds))
200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  processed_dataset = split.map(
202
  preprocess_function,
203
  batched=True,
 
64
  MAX_INPUT_LENGTH = 256
65
  MAX_TARGET_LENGTH = 128
66
 
67
+ """
68
+ def preprocess_function(examples):
69
+
70
  Preprocess entries of the given dataset
71
 
72
  Params:
73
  examples (Dataset): dataset to be preprocessed
74
  Returns:
75
  model_inputs (BatchEncoding): tokenized dataset entries
76
+
77
  inputs, targets = [], []
78
  for i in range(len(examples['question'])):
79
  inputs.append(f"Antwort: {examples['provided_answer'][i]} Lösung: {examples['reference_answer'][i]} Frage: {examples['question'][i]}")
 
86
  model_inputs['labels'] = labels['input_ids']
87
 
88
  return model_inputs
89
+ """
90
 
91
 
92
  def flatten_list(l):
 
200
  model = AutoModelForSeq2SeqLM.from_pretrained(get_model(ds))
201
  tokenizer = AutoTokenizer.from_pretrained(get_tokenizer(ds))
202
 
203
def preprocess_function(examples):
    """Tokenize a batch of dataset entries into model inputs and labels.

    Params:
        examples (Dataset): batch to be preprocessed; read columns are
            'question', 'provided_answer', 'reference_answer',
            'verification_feedback' and 'answer_feedback'
    Returns:
        model_inputs (BatchEncoding): tokenized entries, with a 'labels'
            field holding the tokenized target token ids
    """
    # Build source/target strings column-wise with zip instead of index loops.
    inputs = [
        f"Antwort: {provided} Lösung: {reference} Frage: {question}"
        for provided, reference, question in zip(
            examples['provided_answer'],
            examples['reference_answer'],
            examples['question'],
        )
    ]
    targets = [
        f"{verdict} Feedback: {feedback}"
        for verdict, feedback in zip(
            examples['verification_feedback'],
            examples['answer_feedback'],
        )
    ]

    # apply tokenization to inputs and labels
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        padding='max_length',
        truncation=True,
    )
    labels = tokenizer(
        text_target=targets,
        max_length=MAX_TARGET_LENGTH,
        padding='max_length',
        truncation=True,
    )

    # Seq2seq training/eval expects target ids under the 'labels' key.
    model_inputs['labels'] = labels['input_ids']
    return model_inputs
225
  processed_dataset = split.map(
226
  preprocess_function,
227
  batched=True,