Kevin Fink committed
Commit: ff67bb4
1 Parent(s): 33de791
init
app.py CHANGED
@@ -20,7 +20,7 @@ class LoggingCallback(TrainerCallback):
         error_rate = 1 - state.best_metric  # Assuming best_metric is accuracy
         print(f"Current Error Rate: {error_rate:.4f}")

-@spaces.GPU
+@spaces.GPU(duration=1800)
 def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad):
     try:
         login(api_key.strip())
@@ -35,10 +35,9 @@ def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch

         # Load the model and tokenizer
         model = AutoModelForSeq2SeqLM.from_pretrained(model_name.strip(), num_labels=2)
-
+        model = get_peft_model(model, lora_config)
         tokenizer = AutoTokenizer.from_pretrained(model_name)

-        chunk_size = 1000
         max_length = 128

         # Tokenize the dataset
@@ -48,7 +47,7 @@ def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch
             model_inputs = tokenizer(
                 examples['text'],
                 max_length=max_length,  # Set to None for dynamic padding
-                padding=
+                padding=True,  # Disable padding here, we will handle it later
                 truncation=True,
             )

@@ -56,7 +55,7 @@ def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch
             labels = tokenizer(
                 examples['target'],
                 max_length=max_length,  # Set to None for dynamic padding
-                padding=
+                padding=True,  # Disable padding here, we will handle it later
                 truncation=True,
                 text_target=examples['target']  # Use text_target for target text
             )
@@ -65,7 +64,7 @@ def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch
             model_inputs["labels"] = labels["input_ids"]
             return model_inputs

-        tokenized_datasets = dataset.map(tokenize_function)
+        tokenized_datasets = dataset.map(tokenize_function, batched=True)
         data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

         # Set training arguments
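
The first hunk replaces the bare @spaces.GPU decorator with @spaces.GPU(duration=1800). On a ZeroGPU Space the duration argument is given in seconds, so this reserves the GPU for up to 30 minutes per call rather than the default allocation. A minimal sketch of the decorated entry point, with the body elided:

import spaces

@spaces.GPU(duration=1800)  # reserve the GPU for up to 1800 s (30 min) per call
def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad):
    ...  # the training code shown in the diff runs inside this call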
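
The second hunk wraps the freshly loaded base model with get_peft_model(model, lora_config), so training updates only LoRA adapter weights via the peft library. lora_config itself is defined elsewhere in app.py and does not appear in these hunks; the sketch below shows what such a config typically looks like for a seq2seq model, with assumed hyperparameter values:

from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForSeq2SeqLM

# Placeholder checkpoint; in app.py the model name comes from user input.
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

# Assumed values; the commit's actual lora_config is defined outside the shown hunks.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # matches AutoModelForSeq2SeqLM
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

model = get_peft_model(model, lora_config)  # only the adapter weights remain trainable
model.print_trainable_parameters()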
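
The remaining hunks set padding=True in both tokenizer calls, pass batched=True to dataset.map so the mapping function receives lists of examples, and keep DataCollatorForSeq2Seq, which pads each batch to its longest sequence at collation time. A common variant of this setup, sketched below with a placeholder checkpoint, skips padding during tokenization entirely and leaves it to the collator:

from transformers import AutoTokenizer, DataCollatorForSeq2Seq

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")  # placeholder checkpoint
max_length = 128

def tokenize_function(examples):
    # With batched=True, examples["text"] and examples["target"] are lists of strings.
    return tokenizer(
        examples["text"],
        text_target=examples["target"],  # tokenized targets are returned as "labels"
        max_length=max_length,
        truncation=True,  # no padding here; the collator pads per batch
    )

# Pads input_ids and labels to the longest sequence in each batch.
data_collator = DataCollatorForSeq2Seq(tokenizer)
# tokenized_datasets = dataset.map(tokenize_function, batched=True)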
|