Kevin Fink committed
Commit f4325ab · 1 Parent(s): b529f79
Files changed (1)
  1. app.py +32 -25
app.py CHANGED
@@ -2,7 +2,7 @@ import spaces
 import gradio as gr
 from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM
 from transformers import DataCollatorForSeq2Seq
-from datasets import load_dataset
+from datasets import load_dataset, concatenate_datasets, load_from_disk
 import traceback
 import os
 from huggingface_hub import login
@@ -27,33 +27,40 @@ def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 
     max_length = 64
-
-    # Tokenize the dataset
-    def tokenize_function(examples):
+    try:
+        tokenized_train_dataset = load_from_disk(f'{hub_id.strip()}_train_dataset')
+        tokenized_test_dataset = load_from_disk(f'{hub_id.strip()}_test_dataset')
+        tokenized_datasets = concatenate_datasets([tokenized_train_dataset, tokenized_test_dataset])
+    except:
+        # Tokenize the dataset
+        def tokenize_function(examples):
+
+            # Assuming 'text' is the input and 'target' is the expected output
+            model_inputs = tokenizer(
+                examples['text'],
+                max_length=max_length, # Set to None for dynamic padding
+                padding=True, # Disable padding here, we will handle it later
+                truncation=True,
+            )
+
+            # Setup the decoder input IDs (shifted right)
+            labels = tokenizer(
+                examples['target'],
+                max_length=max_length, # Set to None for dynamic padding
+                padding=True, # Disable padding here, we will handle it later
+                truncation=True,
+                text_target=examples['target'] # Use text_target for target text
+            )
 
-        # Assuming 'text' is the input and 'target' is the expected output
-        model_inputs = tokenizer(
-            examples['text'],
-            max_length=max_length, # Set to None for dynamic padding
-            padding=True, # Disable padding here, we will handle it later
-            truncation=True,
-        )
+            # Add labels to the model inputs
+            model_inputs["labels"] = labels["input_ids"]
+            return model_inputs
 
-        # Setup the decoder input IDs (shifted right)
-        labels = tokenizer(
-            examples['target'],
-            max_length=max_length, # Set to None for dynamic padding
-            padding=True, # Disable padding here, we will handle it later
-            truncation=True,
-            text_target=examples['target'] # Use text_target for target text
-        )
+        tokenized_datasets = dataset.map(tokenize_function, batched=True)
+
+        tokenized_datasets['train'].save_to_disk(f'{hub_id.strip()}_train_dataset')
+        tokenized_datasets['validation'].save_to_disk(f'{hub_id.strip()}_test_dataset')
 
-        # Add labels to the model inputs
-        model_inputs["labels"] = labels["input_ids"]
-        return model_inputs
-
-    tokenized_datasets = dataset.map(tokenize_function, batched=True)
-    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
 
     # Set training arguments
     training_args = TrainingArguments(
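
For context, the change replaces unconditional re-tokenization with a load-from-disk cache: on later runs the already-tokenized train/test splits are reloaded and concatenated, and only on a cache miss is the dataset tokenized and saved. The sketch below reproduces that pattern standalone; everything not in the diff is an assumption: a toy in-memory dataset with the diff's 'text'/'target' columns, the google/flan-t5-small checkpoint, and an explicit os.path.isdir() check in place of the diff's bare try/except.

# Standalone sketch of the tokenize-once-then-cache pattern this commit introduces.
# Assumptions (not from app.py): toy data, flan-t5-small, isdir() instead of try/except.
import os

from datasets import Dataset, DatasetDict, concatenate_datasets, load_from_disk
from transformers import AutoTokenizer

model_name = "google/flan-t5-small"  # assumed seq2seq checkpoint
hub_id = "demo-run"                  # stands in for the app's hub_id argument
max_length = 64

tokenizer = AutoTokenizer.from_pretrained(model_name)

train_path = f"{hub_id.strip()}_train_dataset"
test_path = f"{hub_id.strip()}_test_dataset"

if os.path.isdir(train_path) and os.path.isdir(test_path):
    # Cache hit: reload the already-tokenized splits and merge them, as the diff does.
    tokenized_datasets = concatenate_datasets(
        [load_from_disk(train_path), load_from_disk(test_path)]
    )
else:
    # Cache miss: build a toy dataset, tokenize it, and persist both splits to disk.
    dataset = DatasetDict({
        "train": Dataset.from_dict({"text": ["hello there"], "target": ["bonjour"]}),
        "validation": Dataset.from_dict({"text": ["goodbye"], "target": ["au revoir"]}),
    })

    def tokenize_function(examples):
        # Encode the inputs, then the targets via text_target, and attach them as labels.
        model_inputs = tokenizer(
            examples["text"], max_length=max_length, padding=True, truncation=True
        )
        labels = tokenizer(
            text_target=examples["target"], max_length=max_length,
            padding=True, truncation=True,
        )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized = dataset.map(tokenize_function, batched=True)
    tokenized["train"].save_to_disk(train_path)
    tokenized["validation"].save_to_disk(test_path)
    tokenized_datasets = concatenate_datasets([tokenized["train"], tokenized["validation"]])

print(tokenized_datasets)

One design note on this hedged variant: checking the cache directories explicitly keeps a genuine tokenization or disk error from being silently swallowed, which the diff's bare except: would otherwise do.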