amihai85 committed
Commit 8b67a67 · verified · 1 Parent(s): 5ec8516

Update app.py

Files changed (1)
  1. app.py +28 -9
app.py CHANGED
@@ -1,6 +1,6 @@
import gradio as gr
from datasets import load_dataset
- from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
+ from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq

# Load the dataset
dataset = load_dataset("json", data_files="dataset.jsonl")
@@ -12,29 +12,48 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the dataset
def tokenize_function(examples):
-     return tokenizer(examples["input"], text_target=examples["output"], truncation=True)
+     return tokenizer(
+         examples["input"],
+         text_target=examples["output"],
+         truncation=True,        # Truncate sequences longer than max_length
+         max_length=512,         # Adjust this based on your use case
+         padding="max_length"    # Pad shorter sequences to max_length
+     )

tokenized_dataset = dataset.map(tokenize_function, batched=True)

+ for i, example in enumerate(tokenized_dataset["train"]):
+     input_len = len(example["input_ids"])
+     output_len = len(example["labels"])
+     print(f"Example {i}: Input length = {input_len}, Output length = {output_len}")
+
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
-     overwrite_output_dir=True,
-     eval_strategy="epoch",  # Updated
-     learning_rate=5e-5,
-     per_device_train_batch_size=2,
+     per_device_train_batch_size=1,    # Smaller batch size
+     gradient_accumulation_steps=8,    # Accumulate gradients to simulate larger batch size
    num_train_epochs=3,
-     save_strategy="epoch",
    logging_dir="./logs",
-     logging_strategy="epoch",
+     logging_strategy="steps",
+     save_strategy="epoch",
+     eval_strategy="epoch",
+     learning_rate=5e-5,
+     overwrite_output_dir=True,
+ )
+
+ data_collator = DataCollatorForSeq2Seq(
+     tokenizer,
+     model=model,
+     padding=True,          # Enable dynamic padding
+     return_tensors="pt"
)

- # Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["train"],
+     data_collator=data_collator,  # Use dynamic padding
)

# Train the model
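
The sketch below is not part of the commit; it only illustrates what the new DataCollatorForSeq2Seq with padding=True produces when a batch contains examples of different lengths. The "gpt2" tokenizer is a placeholder assumption here; the Space's actual model_name is defined earlier in app.py and is not visible in this diff.

# Illustration only (not part of app.py): dynamic padding with DataCollatorForSeq2Seq.
# "gpt2" is a placeholder tokenizer; the real model_name is set elsewhere in app.py.
from transformers import AutoTokenizer, DataCollatorForSeq2Seq

tok = AutoTokenizer.from_pretrained("gpt2")
tok.pad_token = tok.eos_token  # GPT-2-style tokenizers have no pad token by default

collator = DataCollatorForSeq2Seq(tok, padding=True, return_tensors="pt")

features = [
    {"input_ids": tok("short question")["input_ids"],
     "labels": tok("short answer")["input_ids"]},
    {"input_ids": tok("a noticeably longer question used for comparison")["input_ids"],
     "labels": tok("a noticeably longer answer for comparison as well")["input_ids"]},
]

batch = collator(features)
# input_ids are padded with the pad token, labels with -100 (ignored by the loss),
# and both only up to the longest example in this batch rather than to a fixed 512.
print(batch["input_ids"].shape, batch["labels"].shape)

Note that in the committed script tokenize_function already pads every example to a fixed 512 tokens, so the collator's dynamic padding only shortens batches once that fixed padding is relaxed (e.g. padding=False).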