MCK-02 committed on
Commit
9e33067
·
1 Parent(s): 1d6c749

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -3
app.py CHANGED
@@ -64,15 +64,16 @@ bertscore = load_metric('bertscore')
64
  MAX_INPUT_LENGTH = 256
65
  MAX_TARGET_LENGTH = 128
66
 
67
- def preprocess_function(examples, tokenizer):
68
- """
 
69
  Preprocess entries of the given dataset
70
 
71
  Params:
72
  examples (Dataset): dataset to be preprocessed
73
  Returns:
74
  model_inputs (BatchEncoding): tokenized dataset entries
75
- """
76
  inputs, targets = [], []
77
  for i in range(len(examples['question'])):
78
  inputs.append(f"Antwort: {examples['provided_answer'][i]} Lösung: {examples['reference_answer'][i]} Frage: {examples['question'][i]}")
@@ -85,6 +86,7 @@ def preprocess_function(examples, tokenizer):
85
  model_inputs['labels'] = labels['input_ids']
86
 
87
  return model_inputs
 
88
 
89
 
90
  def flatten_list(l):
@@ -198,6 +200,28 @@ def load_data():
198
  model = AutoModelForSeq2SeqLM.from_pretrained(get_model(ds))
199
  tokenizer = AutoTokenizer.from_pretrained(get_tokenizer(ds))
200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  processed_dataset = split.map(
202
  preprocess_function,
203
  batched=True,
 
64
  MAX_INPUT_LENGTH = 256
65
  MAX_TARGET_LENGTH = 128
66
 
67
+ """
68
+ def preprocess_function(examples):
69
+
70
  Preprocess entries of the given dataset
71
 
72
  Params:
73
  examples (Dataset): dataset to be preprocessed
74
  Returns:
75
  model_inputs (BatchEncoding): tokenized dataset entries
76
+
77
  inputs, targets = [], []
78
  for i in range(len(examples['question'])):
79
  inputs.append(f"Antwort: {examples['provided_answer'][i]} Lösung: {examples['reference_answer'][i]} Frage: {examples['question'][i]}")
 
86
  model_inputs['labels'] = labels['input_ids']
87
 
88
  return model_inputs
89
+ """
90
 
91
 
92
  def flatten_list(l):
 
200
  model = AutoModelForSeq2SeqLM.from_pretrained(get_model(ds))
201
  tokenizer = AutoTokenizer.from_pretrained(get_tokenizer(ds))
202
 
203
def preprocess_function(examples):
    """Tokenize a batch of dataset entries into model inputs and labels.

    Params:
        examples (Dataset): batch to be preprocessed; read columns are
            'question', 'provided_answer', 'reference_answer',
            'verification_feedback' and 'answer_feedback'
    Returns:
        model_inputs (BatchEncoding): tokenized entries, with a 'labels'
            field holding the tokenized target token ids
    """
    # Build source/target strings column-wise with zip instead of index loops.
    inputs = [
        f"Antwort: {provided} Lösung: {reference} Frage: {question}"
        for provided, reference, question in zip(
            examples['provided_answer'],
            examples['reference_answer'],
            examples['question'],
        )
    ]
    targets = [
        f"{verdict} Feedback: {feedback}"
        for verdict, feedback in zip(
            examples['verification_feedback'],
            examples['answer_feedback'],
        )
    ]

    # apply tokenization to inputs and labels
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        padding='max_length',
        truncation=True,
    )
    labels = tokenizer(
        text_target=targets,
        max_length=MAX_TARGET_LENGTH,
        padding='max_length',
        truncation=True,
    )

    # Seq2seq training/eval expects target ids under the 'labels' key.
    model_inputs['labels'] = labels['input_ids']
    return model_inputs
225
  processed_dataset = split.map(
226
  preprocess_function,
227
  batched=True,