Kevin Fink committed · Commit f4fd08e · 1 Parent(s): 6527df5
dev
app.py
CHANGED
@@ -83,11 +83,41 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
     print("Loading model from checkpoint...")
     model = AutoModelForSeq2SeqLM.from_pretrained(training_args.output_dir)

-
-
+    def tokenize_function(examples):
+
+        # Assuming 'text' is the input and 'target' is the expected output
+        model_inputs = tokenizer(
+            examples['text'],
+            max_length=max_length,    # Set to None for dynamic padding
+            truncation=True,
+            padding='max_length',
+            return_tensors='pt',
+        )
+
+        # Setup the decoder input IDs (shifted right)
+        labels = tokenizer(
+            examples['target'],
+            max_length=max_length,    # Set to None for dynamic padding
+            truncation=True,
+            padding='max_length',
+            text_target=examples['target'],
+            return_tensors='pt',
+        )
+
+        # Add labels to the model inputs
+        model_inputs["labels"] = labels["input_ids"]
+        return model_inputs
+
+    #max_length = 512
+    train_size = len(dataset['train'])
+    half_size = train_size // 2
+    max_length = model.get_input_embeddings().weight.shape[0]
     try:
-
-
+        tokenized_first_half = load_from_disk(f'/data/{hub_id.strip()}_train_dataset')
+        second_half = dataset['train'].select(range(half_size, train_size))
+        tokenized_second_half = tokenize_function(second_half)
+        tokenized_train_dataset = concatenate_datasets([tokenized_first_half, tokenized_second_half])
+        tokenized_test_dataset = tokenize_function(dataset['test'])

         # Create Trainer
         trainer = Trainer(
@@ -99,54 +129,17 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
         )
     except:
         # Load the dataset
-    dataset = load_dataset(dataset_name.strip())
+        dataset = load_dataset(dataset_name.strip())
         tokenizer = AutoTokenizer.from_pretrained('google/t5-efficient-tiny-nh8')
         # Tokenize the dataset
-        def tokenize_function(examples):
-
-            # Assuming 'text' is the input and 'target' is the expected output
-            model_inputs = tokenizer(
-                examples['text'],
-                max_length=max_length,    # Set to None for dynamic padding
-                truncation=True,
-                padding='max_length',
-                return_tensors='pt',
-            )
-
-            # Setup the decoder input IDs (shifted right)
-            labels = tokenizer(
-                examples['target'],
-                max_length=max_length,    # Set to None for dynamic padding
-                truncation=True,
-                padding='max_length',
-                text_target=examples['target'],
-                return_tensors='pt',
-            )

-            # Add labels to the model inputs
-            model_inputs["labels"] = labels["input_ids"]
-            return model_inputs

-
+        first_half = dataset['train'].select(range(half_size))
+        tokenized_half = tokenize_function(first_half)

-
-        tokenized_datasets['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')
-
-        embedding_size = model.get_input_embeddings().weight.shape[0]
-
-        if len(tokenizer) > embedding_size:
-            model.resize_token_embeddings(len(tokenizer))
-            model.resize_position_embeddings(len(tokenizer))
+        tokenized_half.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')

-
-        trainer = Trainer(
-            model=model,
-            args=training_args,
-            train_dataset=tokenized_datasets['train'],
-            eval_dataset=tokenized_datasets['test'],
-            compute_metrics=compute_metrics,
-            #callbacks=[LoggingCallback()],
-        )
+        return 'RUN AGAIN TO LOAD REST OF DATA'

     # Fine-tune the model
     if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
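The added code splits tokenization of the training split across two runs: the first run tokenizes the first half, saves it under /data, and returns 'RUN AGAIN TO LOAD REST OF DATA'; the next run reloads that cached half from disk, tokenizes the remaining rows, and concatenates the two before building the Trainer. Below is a minimal, self-contained sketch of that cache-and-resume pattern with the datasets library. The toy in-memory dataset, the relative cache path, the fixed max_length, and the use of Dataset.map (the commit applies tokenize_function to a Dataset slice directly) are illustrative assumptions, not part of the commit.

# Minimal sketch of the two-run tokenize-and-cache pattern described above.
# Names, paths, and the toy dataset are illustrative assumptions only.
import os
from datasets import Dataset, load_from_disk, concatenate_datasets
from transformers import AutoTokenizer

CACHE_DIR = 'example_train_dataset'  # stands in for f'/data/{hub_id.strip()}_train_dataset'
tokenizer = AutoTokenizer.from_pretrained('google/t5-efficient-tiny-nh8')

def tokenize_function(examples):
    # Tokenize the inputs; build labels from the target text via text_target.
    model_inputs = tokenizer(examples['text'], max_length=64,
                             truncation=True, padding='max_length')
    labels = tokenizer(text_target=examples['target'], max_length=64,
                       truncation=True, padding='max_length')
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Toy stand-in for load_dataset(dataset_name)['train'].
train = Dataset.from_dict({
    'text':   ['first input', 'second input', 'third input', 'fourth input'],
    'target': ['a', 'b', 'c', 'd'],
})
half_size = len(train) // 2

if not os.path.exists(CACHE_DIR):
    # First run: tokenize the first half, cache it to disk, and stop early.
    first_half = train.select(range(half_size)).map(tokenize_function, batched=True)
    first_half.save_to_disk(CACHE_DIR)
    print('RUN AGAIN TO LOAD REST OF DATA')
else:
    # Second run: reload the cached half, tokenize the rest, and combine.
    tokenized_first_half = load_from_disk(CACHE_DIR)
    second_half = train.select(range(half_size, len(train))).map(tokenize_function, batched=True)
    tokenized_train_dataset = concatenate_datasets([tokenized_first_half, second_half])
    print(tokenized_train_dataset)

Because both halves are produced by the same tokenize_function, they share identical features, which is what concatenate_datasets requires.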