Kevin Fink committed
Commit 33de791 · 1 Parent(s): 75c24c0
Files changed (1)
  1. app.py +13 -27
app.py CHANGED
@@ -3,7 +3,6 @@ import gradio as gr
 from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM, TrainerCallback
 from transformers import DataCollatorForSeq2Seq
 from datasets import load_dataset
-from datasets import concatenate_datasets
 import traceback
 from huggingface_hub import login
 from peft import get_peft_model, LoraConfig
@@ -44,43 +43,30 @@ def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch
 
     # Tokenize the dataset
     def tokenize_function(examples):
-        # Tokenize the input text
+
+        # Assuming 'text' is the input and 'target' is the expected output
         model_inputs = tokenizer(
             examples['text'],
-            max_length=max_length,
-            padding=True,
+            max_length=max_length,  # Set to None for dynamic padding
+            padding=False,  # Disable padding here, we will handle it later
             truncation=True,
         )
 
-        # Tokenize the target text
+        # Setup the decoder input IDs (shifted right)
         labels = tokenizer(
             examples['target'],
-            max_length=max_length,
-            padding=True,
+            max_length=max_length,  # Set to None for dynamic padding
+            padding=False,  # Disable padding here, we will handle it later
             truncation=True,
+            text_target=examples['target']  # Use text_target for target text
         )
 
         # Add labels to the model inputs
        model_inputs["labels"] = labels["input_ids"]
         return model_inputs
-
-    # Define a function to process the dataset in chunks
-    def tokenize_in_chunks(dataset, chunk_size=1000):
-        tokenized_datasets = []
-
-        for i in range(0, len(dataset), chunk_size):
-            chunk = dataset[i:i + chunk_size]
-            tokenized_chunk = chunk.map(tokenize_function, batched=True)
-            tokenized_datasets.append(tokenized_chunk)
-
-        # Concatenate all tokenized chunks into a single dataset
-        return tokenized_datasets
-
-    # Tokenize the dataset in chunks
-    tokenized_datasets = tokenize_in_chunks(dataset, chunk_size=1000)
-
-    # If you want to combine all chunks into a single dataset
-    final_tokenized_dataset = concatenate_datasets(tokenized_datasets)
+
+    tokenized_datasets = dataset.map(tokenize_function)
+    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
 
     # Set training arguments
     training_args = TrainingArguments(
@@ -108,8 +94,8 @@ def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch
     trainer = Trainer(
         model=model,
         args=training_args,
-        train_dataset=final_tokenized_dataset['train'],
-        eval_dataset=final_tokenized_dataset['test'],
+        train_dataset=data_collator['train'],
+        eval_dataset=data_collator['test'],
         #callbacks=[LoggingCallback()],
     )
 
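
For reference, a minimal runnable sketch of the pattern this commit moves toward: tokenize without padding and let DataCollatorForSeq2Seq pad each batch dynamically, with the tokenized splits and the collator handed to Trainer separately. The checkpoint name, dataset files, split names ('train'/'test'), and max_length value below are illustrative assumptions, not taken from app.py.

from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)

model_name = "google/flan-t5-small"   # assumed checkpoint, for illustration only
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Assumed dataset with 'text'/'target' columns and 'train'/'test' splits
dataset = load_dataset("json", data_files={"train": "train.json", "test": "test.json"})
max_length = 128                      # assumed value

def tokenize_function(examples):
    # No padding here; DataCollatorForSeq2Seq pads each batch to its longest member
    model_inputs = tokenizer(examples["text"], max_length=max_length, truncation=True)
    labels = tokenizer(text_target=examples["target"], max_length=max_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=8,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],  # tokenized splits go here,
    eval_dataset=tokenized_datasets["test"],    # not the collator itself
    data_collator=data_collator,                # collator is passed separately
)
trainer.train()

Padding only at batch time keeps short sequences from being padded out to the global max_length, which is usually faster and lighter on memory than padding the whole dataset up front.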